From 557d4a4a7bf522e4f4c4635935e136fbdef9d4bd Mon Sep 17 00:00:00 2001 From: Matthias Benkard Date: Thu, 8 Oct 2009 16:30:08 +0200 Subject: Add xmls to the repository. Ignore-this: 7947f9f9c082d8a0ff7fd7e1b31754ae darcs-hash:87d1fa99e2c798638dcf1c5718904bb52215dd7f --- xmls/xmls.lisp | 631 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 631 insertions(+) create mode 100644 xmls/xmls.lisp (limited to 'xmls/xmls.lisp') diff --git a/xmls/xmls.lisp b/xmls/xmls.lisp new file mode 100644 index 0000000..2f65c1d --- /dev/null +++ b/xmls/xmls.lisp @@ -0,0 +1,631 @@ +;;; xmls +;;; a simple xml parser for common lisp +;;; author: Miles Egan +;;; see COPYING file for license information + +(defpackage xmls + (:use :cl :cl-user) + (:export node-name node-ns node-attrs node-children make-node parse toxml write-xml)) + +(in-package :xmls) + +;;;----------------------------------------------------------------------------- +;;; GLOBAL SETTINGS +;;;----------------------------------------------------------------------------- +(defvar *strip-comments* t) +(defvar *compress-whitespace* t) +(defvar *test-verbose* nil) +(defvar *entities* + #(("lt;" #\<) + ("gt;" #\>) + ("amp;" #\&) + ("apos;" #\') + ("quot;" #\"))) +(eval-when (:compile-toplevel :load-toplevel :execute) + (defvar *whitespace* (remove-duplicates + '(#\Newline #\Space #\Tab #\Return #\Linefeed)))) +(defvar *char-escapes* + (let ((table (make-array 256 :element-type 'string :initial-element ""))) + (declare (type vector *entities*)) + (loop + for code from 0 to 255 + for char = (code-char code) + for entity = (first (find char *entities* :test #'char= :key #'second)) + do (setf (svref table code) + (cond + (entity + (concatenate 'string "&" entity)) + ((and (or (< code 32) (> code 126)) + (not (= code 10)) + (not (= code 9))) + (format nil "&#x~x;" code)) + (t + (format nil "~x" char)))) + finally (return table)) + table)) + +;;;----------------------------------------------------------------------------- +;;; CONDITIONS +;;;----------------------------------------------------------------------------- +(define-condition xml-parse-error (error) + ((line :initarg :line + :reader error-line))) + +;;;----------------------------------------------------------------------------- +;;; NODE INTERFACE +;;;----------------------------------------------------------------------------- +(defun make-node (&key name ns attrs child children) + "Convenience function for creating a new xml node." + (list* (if ns (cons name ns) name) + attrs + (if child + (list child) + children))) + +(defun node-name (elem) + (if (consp (car elem)) + (caar elem) + (car elem))) + +(defun node-ns (elem) + (if (consp (car elem)) + (cdar elem) + nil)) + +(defun (setf node-ns) (ns elem) + (setf (car elem) + (cons (node-name elem) ns))) + +(defun node-attrs (elem) (second elem)) + +(defun (setf node-attrs) (attrs elem) + (setf (second elem) attrs)) + +(defun node-children (elem) + (cddr elem)) + +(defun (setf node-children) (children elem) + (rplacd (cdr elem) children) + (node-children elem)) + +;;;----------------------------------------------------------------------------- +;;; UTILITY FUNCTIONS +;;;----------------------------------------------------------------------------- +(defun compress-whitespace (str) + (if *compress-whitespace* + (progn + (setf str (string-trim *whitespace* str)) + (if (= 0 (length str)) + nil + str)) + str)) + +(defun write-escaped (string stream) + "Writes string to stream with all character entities escaped." + (coerce string 'simple-base-string) + (when (eq stream t) (setf stream *standard-output*)) + (loop for char across string + for esc = (svref *char-escapes* (char-code char)) + do (write-sequence esc stream))) + +(defun make-extendable-string (&optional (size 10)) + "Creates an adjustable string with a fill pointer." + (make-array size + :element-type 'character + :adjustable t + :fill-pointer 0)) + +(defun push-string (c string) + "Shorthand function for adding characters to an extendable string." + (vector-push-extend c string)) + +(defun translate-raw-value (raw-value) + "Helper function for xml generation." + (etypecase raw-value + (string raw-value) + (symbol (symbol-name raw-value)) + (integer (format nil "~D" raw-value)) + (float (format nil "~G" raw-value)))) + +(defun generate-xml (e s indent) + "Renders a lisp node tree to an xml string stream." + (if (> indent 0) (incf indent)) + (etypecase e + (list + (progn + (dotimes (i (* 2 (- indent 2))) + (write-char #\Space s)) + (format s "<~A~@[ xmlns=\"~A\"~]" (node-name e) (node-ns e)) + (loop for a in (node-attrs e) + do (progn + (write-char #\Space s) + (write-string (first a) s) + (write-char #\= s) + (write-char #\" s) + (write-escaped (second a) s) + (write-char #\" s)))) + (if (null (node-children e)) + (progn + (write-string "/>" s) + (if (> indent 0) (write-char #\Newline s))) + (progn + (write-char #\> s) + (if (> indent 0) (write-char #\Newline s)) + (mapcan (lambda (c) (generate-xml c s indent)) (node-children e)) + (if (> indent 0) + (progn + (dotimes (i (* 2 (- indent 2))) + (write-char #\Space s)))) + (format s "" (node-name e)) + (if (> indent 0) (write-char #\Newline s))))) + (number + (generate-xml (translate-raw-value e) s indent)) + (symbol + (generate-xml (translate-raw-value e) s indent)) + (string + (progn + (if (> indent 0) + (progn + (dotimes (i (* 2 (- indent 2))) + (write-char #\Space s)))) + (write-escaped e s) + (if (> indent 0) (write-char #\Newline s)))))) + +;;;----------------------------------------------------------------------------- +;;; PARSER STATE & LOOKAHEAD +;;;----------------------------------------------------------------------------- +(defstruct state + "Represents parser state. Passed among rules to avoid threading issues." + (got-doctype nil) + (lines 1 :type integer) + nsstack + stream) + +(defun resolve-entity (ent) + "Resolves the xml entity ENT to a character. Numeric entities are +converted using CODE-CHAR, which only works in implementations that +internally encode strings in US-ASCII, ISO-8859-1 or UCS." + (declare (type simple-base-string ent)) + (declare (type vector *entities*)) + (or (and (>= (length ent) 2) + (char= (char ent 0) #\#) + (code-char + (if (char= (char ent 1) #\x) + (parse-integer ent :start 2 :end (- (length ent) 1) :radix 16) + (parse-integer ent :start 1 :end (- (length ent) 1))))) + (second (find ent *entities* :test #'string= :key #'first)) + (error "Unable to resolve entity ~S" ent))) + +(declaim (inline peek-stream)) +(defun peek-stream (stream) + "Looks one character ahead in the input stream. Serves as a potential hook for +character translation." + (peek-char nil stream nil)) + +(defun read-stream (stream) + "Reads a character from the stream, translating entities as it goes." + (let ((c (read-char stream nil))) + (if (and c (not (char= c #\&))) + c + (loop with ent = (make-extendable-string 5) + for char = (read-char stream) + do (push-string char ent) + until (char= char #\;) + finally (return (resolve-entity (coerce ent 'simple-string))))))) + +(define-symbol-macro next-char (peek-stream (state-stream s))) + +(defmacro eat () + "Consumes one character from the input stream." + `(read-char (state-stream s))) + +(defmacro match (&rest matchers) + "Attempts to match the next input character with one of the supplied matchers." + `(let ((c (peek-stream (state-stream s)))) + (and + (or ,@(loop for m in matchers + collect (etypecase m + (standard-char `(char= ,m c)) + (symbol `(,m c))))) + ;; cheat here a little bit - eat entire char entity instead + ;; of peeked char + (read-stream (state-stream s))))) + +(defmacro match-seq (&rest sequence) + "Tries to match the supplied matchers in sequence with characters in the input stream." + `(and ,@(loop for s in sequence + collect `(match ,s)))) + +(defmacro match* (&rest sequence) + "Matches any occurances of any of the supplied matchers." + `(loop with data = (make-extendable-string 10) + for c = (match ,@sequence) + while c + do (push-string c data) + finally (return data))) + +(defmacro match+ (&rest sequence) + "Matches one or more occurances of any of the supplied matchers." + `(and (peek ,@sequence) + (match* ,@sequence))) + +(defmacro peek (&rest matchers) + "Looks ahead for an occurance of any of the supplied matchers." + `(let ((c (peek-stream (state-stream s)))) + (or ,@(loop for m in matchers + collect (etypecase m + (standard-char `(char= ,m c)) + (symbol `(,m c))))))) + +(defmacro must (&rest body) + "Throws a parse error if the supplied forms do not succeed." + `(or (progn ,@body) + (error 'xml-parse-error))) + +;;;----------------------------------------------------------------------------- +;;; PARSER INTERNAL FUNCTIONS +;;;----------------------------------------------------------------------------- +(defstruct element + "Common return type of all rule functions." + (type nil :type symbol) + (val nil)) + +(defun resolve-namespace (elem env) + "Maps the ns prefix to its associated url via the supplied ns env." + (let ((ns (node-ns elem))) + (dolist (e env) + (let ((nsurl (assoc ns e :test #'string=))) + (and nsurl + (setf (node-ns elem) (cadr nsurl)) + (return ns)))))) + +;;;----------------------------------------------------------------------------- +;;; MATCH AND RULE BUILDING UTILITIES +;;;----------------------------------------------------------------------------- +(defmacro defmatch (name &rest body) + "Match definition macro that provides a common lexical environment for matchers." + `(defun ,name (c) + ,@body)) + +(defmacro defrule (name &rest body) + "Rule definition macro that provides a common lexical environment for rules." + `(defun ,name (s) + ,@body)) + +(defmacro matchfn (name) + "Convenience macro for creating an anonymous function wrapper around a matcher macro." + `(lambda (s) (match ,name))) + +(defun none-or-more (s func) + "Collects any matches of the supplied rule with the input stream." + (declare (type function func)) + (let ((val (funcall func s))) + (if val + (multiple-value-bind (res nextval) + (none-or-more s func) + (values res (cons val nextval))) + (values t nil)))) + +(defun one-or-more (s func) + "Collects one or more matches of the supplied rule with the input stream." + (declare (type function func)) + (let ((val (funcall func s))) + (if val + (multiple-value-bind (res nextval) + (none-or-more s func) + (declare (ignore res)) + (cons val nextval)) + nil))) + +;;;----------------------------------------------------------------------------- +;;; MATCHERS +;;;----------------------------------------------------------------------------- +(defmatch digit () + (and c (digit-char-p c))) + +(defmatch letter () + (and c (alpha-char-p c))) + +(defmatch ws-char () + (case c + (#.*whitespace* t) + (t nil))) + +(defmatch namechar () + (or + (and c (alpha-char-p c)) + (and c (digit-char-p c)) + (case c + ((#\. #\- #\_ #\:) t)))) + +(defmatch ncname-char () + (or + (and c (alpha-char-p c)) + (and c (digit-char-p c)) + (case c + ((#\. #\- #\_) t)))) + +(defmatch attr-text-dq () + (and c (not (member c (list #\< #\"))))) + +(defmatch attr-text-sq () + (and c (not (member c (list #\< #\'))))) + +(defmatch chardata () + (and c (not (char= c #\<)))) + +(defmatch comment-char () + (and c (not (eql c #\-)))) + +;;;----------------------------------------------------------------------------- +;;; RULES +;;;----------------------------------------------------------------------------- +(defrule ncname () + (and (peek letter #\_) + (match+ ncname-char))) + +(defrule qname () + (let (name suffix) + (and + (setf name (ncname s)) + (or + (and + (match #\:) + (setf suffix (ncname s))) + t)) + (values name suffix))) + +(defrule attr-or-nsdecl () + (let (suffix name val) + (and + (setf (values name suffix) (qname s)) + (or + (and + (progn + (match* ws-char) + (match #\=)) + (or + (and + (progn + (match* ws-char) + (match #\")) + (setf val (match* attr-text-dq)) + (match #\")) + (and + (progn + (match* ws-char) + (match #\')) + (setf val (match* attr-text-sq)) + (match #\')))) + t) + (if (string= "xmlns" name) (list 'nsdecl suffix val) (list + 'attr (or suffix name) val))))) + +(defrule ws () + (and (match+ ws-char) + (make-element :type 'whitespace :val nil))) + +(defrule name () + (and + (peek namechar #\_ #\:) + (match* namechar))) + +(defrule ws-attr-or-nsdecl () + (and + (ws s) + (attr-or-nsdecl s))) + +(defrule start-tag () + (let (name suffix attrs nsdecls) + (and + (peek namechar) + (setf (values name suffix) (qname s)) + (multiple-value-bind (res a) + (none-or-more s #'ws-attr-or-nsdecl) + (mapcar (lambda (x) (if (eq (car x) 'attr) + (push (cdr x) attrs) + (push (cdr x) nsdecls))) + a) + res) + (or (ws s) t) + (values + (make-node + :name (or suffix name) + :ns (and suffix name) + :attrs attrs) + nsdecls)))) + +(defrule end-tag () + (let (name suffix) + (and + (match #\/) + (setf (values name suffix) (qname s)) + (or (ws s) t) + (match #\>) + (make-element :type 'end-tag :val (or suffix name))))) + +(defrule comment () + (and + (match-seq #\! #\- #\-) + (progn + (loop until (match-seq #\- #\- #\>) + do (eat)) + t) + (make-element :type 'comment))) + +(defrule comment-or-cdata () + (and + (peek #\!) + (must (or (comment s) + (and + (match-seq #\[ #\C #\D #\A #\T #\A #\[) + (loop with data = (make-extendable-string 50) + with state = 0 + do (case state + (0 (if (match #\]) + (incf state) + (push-string (eat) data))) + (1 (if (match #\]) + (incf state) + (progn + (setf state 0) + (push-string #\] data) + (push-string (eat) data)))) + (2 (if (match #\>) + (incf state) + (progn + (setf state 0) + (push-string #\] data) + (push-string #\] data) + (push-string (eat) data))))) + until (eq state 3) + finally (return (make-element + :type 'cdata + :val (coerce data 'simple-base-string))))))))) + +(declaim (ftype function element)) ; forward decl for content rule +(defrule content () + (if (match #\<) + (must (or (comment-or-cdata s) + (element s) + (end-tag s))) + (or (let (content) + (and (setf content (match+ chardata)) + (make-element :type 'data :val (compress-whitespace content))))))) + +(defrule element () + (let (elem children nsdecls end-name) + (and + ;; parse front end of tag + (multiple-value-bind (e n) + (start-tag s) + (setf elem e) + (setf nsdecls n) + e) + ;; resolve namespaces *before* parsing children + (if nsdecls (push nsdecls (state-nsstack s)) t) + (or (if (or nsdecls (state-nsstack s)) + (resolve-namespace elem (state-nsstack s))) + t) + ;; parse end-tag and children + (or + (match-seq #\/ #\>) + (and + (match #\>) + (loop for c = (content s) + while c + do (etypecase c + (element (case (element-type c) + ('end-tag + (return (setf end-name (element-val c)))) + (t (if (element-val c) + (push (element-val c) children))))))) + (string= (node-name elem) end-name))) + ;; package up new node + (progn + (setf (node-children elem) (nreverse children)) + (make-element :type 'elem :val elem))))) + +(defrule processing-instruction-or-xmldecl () + (let (name) + (and + (match #\?) + (setf name (name s)) + (none-or-more s #'ws-attr-or-nsdecl) + (match-seq #\? #\>) + (make-element :type 'pi :val name)))) + +(defrule processing-instruction () + (let ((p (processing-instruction-or-xmldecl s))) + (and p + (not (string= (element-val p) "xml")) + p))) + +(defrule xmldecl () + (let ((p (processing-instruction-or-xmldecl s))) + (and p + (string= (element-val p) "xml") + p))) + +(defrule comment-or-doctype () + ;; skip dtd - bail out to comment if it's a comment + ;; only match doctype once + (and + (peek #\!) + (or (comment s) + (and (not (state-got-doctype s)) + (must (match-seq #\D #\O #\C #\T #\Y #\P #\E)) + (loop with level = 1 + do (case (eat) + (#\> (decf level)) + (#\< (incf level))) + until (eq level 0) + finally (return t)) + (setf (state-got-doctype s) t) + (make-element :type 'doctype))))) + +(defrule misc () + (or + (ws s) + (and (match #\<) (must (or (processing-instruction s) + (comment-or-doctype s) + (element s)))))) + +(defrule document () + (let (elem) + (if (match #\<) + (must (or (processing-instruction-or-xmldecl s) + (comment-or-doctype s) + (setf elem (element s))))) + (unless elem + (loop for c = (misc s) + while c do (if (eql (element-type c) 'elem) + (return (setf elem c))))) + (and elem (element-val elem)))) + +;;;----------------------------------------------------------------------------- +;;; PUBLIC INTERFACE +;;;----------------------------------------------------------------------------- +(defun write-xml (e s &key (indent nil)) + "Renders a lisp node tree to an xml stream. Indents if indent is non-nil." + (generate-xml e s (if indent 1 0))) + +(defun toxml (e &key (indent nil)) + "Renders a lisp node tree to an xml string." + (with-output-to-string (s) + (write-xml e s :indent indent))) + +(defun parse (s &key (compress-whitespace t)) + "Parses the supplied stream or string into a lisp node tree." + (let ((*compress-whitespace* compress-whitespace) + (stream + (etypecase s + (string (make-string-input-stream s)) + (stream s)))) + (handler-case + (document (make-state :stream stream)) + (end-of-file () nil) + (xml-parse-error () nil)))) + +;;(trace end-tag comment comment-or-doctype content name xmldecl misc) +;;(trace processing-instruction processing-instruction-or-xmldecl element start-tag ws element-val) + +#+(or sbcl cmu allegro) +(defun test () + ;;(sb-profile:profile "XMLS") + #+cmu(extensions:gc-off) ;; too noisy + (dolist (test (cdr + #+sbcl sb-ext:*posix-argv* + #+cmu (subseq extensions:*command-line-strings* 4) + #+allegro (sys:command-line-arguments))) + (if *test-verbose* + (format t "~A~%" (toxml (parse (open test) :compress-whitespace t) :indent t)) + (progn + (format t "~40A" (concatenate 'string test "... ")) + (if (parse (open test)) + (format t "ok~%") + (format t "failed!~%"))))) + ;;(sb-profile:report) + #+sbcl(sb-ext:quit) + #+cmu(extensions:quit) + #+allegro(excl:exit)) -- cgit v1.2.3