;;; xmls ;;; a simple xml parser for common lisp ;;; author: Miles Egan ;;; see COPYING file for license information (defpackage xmls (:use :cl :cl-user) (:export node-name node-ns node-attrs node-children make-node parse toxml write-xml)) (in-package :xmls) ;;;----------------------------------------------------------------------------- ;;; GLOBAL SETTINGS ;;;----------------------------------------------------------------------------- (defvar *strip-comments* t) (defvar *compress-whitespace* t) (defvar *test-verbose* nil) (defvar *entities* #(("lt;" #\<) ("gt;" #\>) ("amp;" #\&) ("apos;" #\') ("quot;" #\"))) (eval-when (:compile-toplevel :load-toplevel :execute) (defvar *whitespace* (remove-duplicates '(#\Newline #\Space #\Tab #\Return #\Linefeed)))) (defvar *char-escapes* (let ((table (make-array 256 :element-type 'string :initial-element ""))) (declare (type vector *entities*)) (loop for code from 0 to 255 for char = (code-char code) for entity = (first (find char *entities* :test #'char= :key #'second)) do (setf (svref table code) (cond (entity (concatenate 'string "&" entity)) ((and (or (< code 32) (> code 126)) (not (= code 10)) (not (= code 9))) (format nil "&#x~x;" code)) (t (format nil "~x" char)))) finally (return table)) table)) ;;;----------------------------------------------------------------------------- ;;; CONDITIONS ;;;----------------------------------------------------------------------------- (define-condition xml-parse-error (error) ((line :initarg :line :reader error-line))) ;;;----------------------------------------------------------------------------- ;;; NODE INTERFACE ;;;----------------------------------------------------------------------------- (defun make-node (&key name ns attrs child children) "Convenience function for creating a new xml node." (list* (if ns (cons name ns) name) attrs (if child (list child) children))) (defun node-name (elem) (if (consp (car elem)) (caar elem) (car elem))) (defun node-ns (elem) (if (consp (car elem)) (cdar elem) nil)) (defun (setf node-ns) (ns elem) (setf (car elem) (cons (node-name elem) ns))) (defun node-attrs (elem) (second elem)) (defun (setf node-attrs) (attrs elem) (setf (second elem) attrs)) (defun node-children (elem) (cddr elem)) (defun (setf node-children) (children elem) (rplacd (cdr elem) children) (node-children elem)) ;;;----------------------------------------------------------------------------- ;;; UTILITY FUNCTIONS ;;;----------------------------------------------------------------------------- (defun compress-whitespace (str) (if *compress-whitespace* (progn (setf str (string-trim *whitespace* str)) (if (= 0 (length str)) nil str)) str)) (defun write-escaped (string stream) "Writes string to stream with all character entities escaped." (coerce string 'simple-base-string) (when (eq stream t) (setf stream *standard-output*)) (loop for char across string for esc = (svref *char-escapes* (char-code char)) do (write-sequence esc stream))) (defun make-extendable-string (&optional (size 10)) "Creates an adjustable string with a fill pointer." (make-array size :element-type 'character :adjustable t :fill-pointer 0)) (defun push-string (c string) "Shorthand function for adding characters to an extendable string." (vector-push-extend c string)) (defun translate-raw-value (raw-value) "Helper function for xml generation." (etypecase raw-value (string raw-value) (symbol (symbol-name raw-value)) (integer (format nil "~D" raw-value)) (float (format nil "~G" raw-value)))) (defun generate-xml (e s indent) "Renders a lisp node tree to an xml string stream." (if (> indent 0) (incf indent)) (etypecase e (list (progn (dotimes (i (* 2 (- indent 2))) (write-char #\Space s)) (format s "<~A~@[ xmlns=\"~A\"~]" (node-name e) (node-ns e)) (loop for a in (node-attrs e) do (progn (write-char #\Space s) (write-string (first a) s) (write-char #\= s) (write-char #\" s) (write-escaped (second a) s) (write-char #\" s)))) (if (null (node-children e)) (progn (write-string "/>" s) (if (> indent 0) (write-char #\Newline s))) (progn (write-char #\> s) (if (> indent 0) (write-char #\Newline s)) (mapcan (lambda (c) (generate-xml c s indent)) (node-children e)) (if (> indent 0) (progn (dotimes (i (* 2 (- indent 2))) (write-char #\Space s)))) (format s "" (node-name e)) (if (> indent 0) (write-char #\Newline s))))) (number (generate-xml (translate-raw-value e) s indent)) (symbol (generate-xml (translate-raw-value e) s indent)) (string (progn (if (> indent 0) (progn (dotimes (i (* 2 (- indent 2))) (write-char #\Space s)))) (write-escaped e s) (if (> indent 0) (write-char #\Newline s)))))) ;;;----------------------------------------------------------------------------- ;;; PARSER STATE & LOOKAHEAD ;;;----------------------------------------------------------------------------- (defstruct state "Represents parser state. Passed among rules to avoid threading issues." (got-doctype nil) (lines 1 :type integer) nsstack stream) (defun resolve-entity (ent) "Resolves the xml entity ENT to a character. Numeric entities are converted using CODE-CHAR, which only works in implementations that internally encode strings in US-ASCII, ISO-8859-1 or UCS." (declare (type simple-base-string ent)) (declare (type vector *entities*)) (or (and (>= (length ent) 2) (char= (char ent 0) #\#) (code-char (if (char= (char ent 1) #\x) (parse-integer ent :start 2 :end (- (length ent) 1) :radix 16) (parse-integer ent :start 1 :end (- (length ent) 1))))) (second (find ent *entities* :test #'string= :key #'first)) (error "Unable to resolve entity ~S" ent))) (declaim (inline peek-stream)) (defun peek-stream (stream) "Looks one character ahead in the input stream. Serves as a potential hook for character translation." (peek-char nil stream nil)) (defun read-stream (stream) "Reads a character from the stream, translating entities as it goes." (let ((c (read-char stream nil))) (if (and c (not (char= c #\&))) c (loop with ent = (make-extendable-string 5) for char = (read-char stream) do (push-string char ent) until (char= char #\;) finally (return (resolve-entity (coerce ent 'simple-string))))))) (define-symbol-macro next-char (peek-stream (state-stream s))) (defmacro eat () "Consumes one character from the input stream." `(read-char (state-stream s))) (defmacro match (&rest matchers) "Attempts to match the next input character with one of the supplied matchers." `(let ((c (peek-stream (state-stream s)))) (and (or ,@(loop for m in matchers collect (etypecase m (standard-char `(char= ,m c)) (symbol `(,m c))))) ;; cheat here a little bit - eat entire char entity instead ;; of peeked char (read-stream (state-stream s))))) (defmacro match-seq (&rest sequence) "Tries to match the supplied matchers in sequence with characters in the input stream." `(and ,@(loop for s in sequence collect `(match ,s)))) (defmacro match* (&rest sequence) "Matches any occurances of any of the supplied matchers." `(loop with data = (make-extendable-string 10) for c = (match ,@sequence) while c do (push-string c data) finally (return data))) (defmacro match+ (&rest sequence) "Matches one or more occurances of any of the supplied matchers." `(and (peek ,@sequence) (match* ,@sequence))) (defmacro peek (&rest matchers) "Looks ahead for an occurance of any of the supplied matchers." `(let ((c (peek-stream (state-stream s)))) (or ,@(loop for m in matchers collect (etypecase m (standard-char `(char= ,m c)) (symbol `(,m c))))))) (defmacro must (&rest body) "Throws a parse error if the supplied forms do not succeed." `(or (progn ,@body) (error 'xml-parse-error))) ;;;----------------------------------------------------------------------------- ;;; PARSER INTERNAL FUNCTIONS ;;;----------------------------------------------------------------------------- (defstruct element "Common return type of all rule functions." (type nil :type symbol) (val nil)) (defun resolve-namespace (elem env) "Maps the ns prefix to its associated url via the supplied ns env." (let ((ns (node-ns elem))) (dolist (e env) (let ((nsurl (assoc ns e :test #'string=))) (and nsurl (setf (node-ns elem) (cadr nsurl)) (return ns)))))) ;;;----------------------------------------------------------------------------- ;;; MATCH AND RULE BUILDING UTILITIES ;;;----------------------------------------------------------------------------- (defmacro defmatch (name &rest body) "Match definition macro that provides a common lexical environment for matchers." `(defun ,name (c) ,@body)) (defmacro defrule (name &rest body) "Rule definition macro that provides a common lexical environment for rules." `(defun ,name (s) ,@body)) (defmacro matchfn (name) "Convenience macro for creating an anonymous function wrapper around a matcher macro." `(lambda (s) (match ,name))) (defun none-or-more (s func) "Collects any matches of the supplied rule with the input stream." (declare (type function func)) (let ((val (funcall func s))) (if val (multiple-value-bind (res nextval) (none-or-more s func) (values res (cons val nextval))) (values t nil)))) (defun one-or-more (s func) "Collects one or more matches of the supplied rule with the input stream." (declare (type function func)) (let ((val (funcall func s))) (if val (multiple-value-bind (res nextval) (none-or-more s func) (declare (ignore res)) (cons val nextval)) nil))) ;;;----------------------------------------------------------------------------- ;;; MATCHERS ;;;----------------------------------------------------------------------------- (defmatch digit () (and c (digit-char-p c))) (defmatch letter () (and c (alpha-char-p c))) (defmatch ws-char () (case c (#.*whitespace* t) (t nil))) (defmatch namechar () (or (and c (alpha-char-p c)) (and c (digit-char-p c)) (case c ((#\. #\- #\_ #\:) t)))) (defmatch ncname-char () (or (and c (alpha-char-p c)) (and c (digit-char-p c)) (case c ((#\. #\- #\_) t)))) (defmatch attr-text-dq () (and c (not (member c (list #\< #\"))))) (defmatch attr-text-sq () (and c (not (member c (list #\< #\'))))) (defmatch chardata () (and c (not (char= c #\<)))) (defmatch comment-char () (and c (not (eql c #\-)))) ;;;----------------------------------------------------------------------------- ;;; RULES ;;;----------------------------------------------------------------------------- (defrule ncname () (and (peek letter #\_) (match+ ncname-char))) (defrule qname () (let (name suffix) (and (setf name (ncname s)) (or (and (match #\:) (setf suffix (ncname s))) t)) (values name suffix))) (defrule attr-or-nsdecl () (let (suffix name val) (and (setf (values name suffix) (qname s)) (or (and (progn (match* ws-char) (match #\=)) (or (and (progn (match* ws-char) (match #\")) (setf val (match* attr-text-dq)) (match #\")) (and (progn (match* ws-char) (match #\')) (setf val (match* attr-text-sq)) (match #\')))) t) (if (string= "xmlns" name) (list 'nsdecl suffix val) (list 'attr (or suffix name) val))))) (defrule ws () (and (match+ ws-char) (make-element :type 'whitespace :val nil))) (defrule name () (and (peek namechar #\_ #\:) (match* namechar))) (defrule ws-attr-or-nsdecl () (and (ws s) (attr-or-nsdecl s))) (defrule start-tag () (let (name suffix attrs nsdecls) (and (peek namechar) (setf (values name suffix) (qname s)) (multiple-value-bind (res a) (none-or-more s #'ws-attr-or-nsdecl) (mapcar (lambda (x) (if (eq (car x) 'attr) (push (cdr x) attrs) (push (cdr x) nsdecls))) a) res) (or (ws s) t) (values (make-node :name (or suffix name) :ns (and suffix name) :attrs attrs) nsdecls)))) (defrule end-tag () (let (name suffix) (and (match #\/) (setf (values name suffix) (qname s)) (or (ws s) t) (match #\>) (make-element :type 'end-tag :val (or suffix name))))) (defrule comment () (and (match-seq #\! #\- #\-) (progn (loop until (match-seq #\- #\- #\>) do (eat)) t) (make-element :type 'comment))) (defrule comment-or-cdata () (and (peek #\!) (must (or (comment s) (and (match-seq #\[ #\C #\D #\A #\T #\A #\[) (loop with data = (make-extendable-string 50) with state = 0 do (case state (0 (if (match #\]) (incf state) (push-string (eat) data))) (1 (if (match #\]) (incf state) (progn (setf state 0) (push-string #\] data) (push-string (eat) data)))) (2 (if (match #\>) (incf state) (progn (setf state 0) (push-string #\] data) (push-string #\] data) (push-string (eat) data))))) until (eq state 3) finally (return (make-element :type 'cdata :val (coerce data 'simple-base-string))))))))) (declaim (ftype function element)) ; forward decl for content rule (defrule content () (if (match #\<) (must (or (comment-or-cdata s) (element s) (end-tag s))) (or (let (content) (and (setf content (match+ chardata)) (make-element :type 'data :val (compress-whitespace content))))))) (defrule element () (let (elem children nsdecls end-name) (and ;; parse front end of tag (multiple-value-bind (e n) (start-tag s) (setf elem e) (setf nsdecls n) e) ;; resolve namespaces *before* parsing children (if nsdecls (push nsdecls (state-nsstack s)) t) (or (if (or nsdecls (state-nsstack s)) (resolve-namespace elem (state-nsstack s))) t) ;; parse end-tag and children (or (match-seq #\/ #\>) (and (match #\>) (loop for c = (content s) while c do (etypecase c (element (case (element-type c) ('end-tag (return (setf end-name (element-val c)))) (t (if (element-val c) (push (element-val c) children))))))) (string= (node-name elem) end-name))) ;; package up new node (progn (setf (node-children elem) (nreverse children)) (make-element :type 'elem :val elem))))) (defrule processing-instruction-or-xmldecl () (let (name) (and (match #\?) (setf name (name s)) (none-or-more s #'ws-attr-or-nsdecl) (match-seq #\? #\>) (make-element :type 'pi :val name)))) (defrule processing-instruction () (let ((p (processing-instruction-or-xmldecl s))) (and p (not (string= (element-val p) "xml")) p))) (defrule xmldecl () (let ((p (processing-instruction-or-xmldecl s))) (and p (string= (element-val p) "xml") p))) (defrule comment-or-doctype () ;; skip dtd - bail out to comment if it's a comment ;; only match doctype once (and (peek #\!) (or (comment s) (and (not (state-got-doctype s)) (must (match-seq #\D #\O #\C #\T #\Y #\P #\E)) (loop with level = 1 do (case (eat) (#\> (decf level)) (#\< (incf level))) until (eq level 0) finally (return t)) (setf (state-got-doctype s) t) (make-element :type 'doctype))))) (defrule misc () (or (ws s) (and (match #\<) (must (or (processing-instruction s) (comment-or-doctype s) (element s)))))) (defrule document () (let (elem) (if (match #\<) (must (or (processing-instruction-or-xmldecl s) (comment-or-doctype s) (setf elem (element s))))) (unless elem (loop for c = (misc s) while c do (if (eql (element-type c) 'elem) (return (setf elem c))))) (and elem (element-val elem)))) ;;;----------------------------------------------------------------------------- ;;; PUBLIC INTERFACE ;;;----------------------------------------------------------------------------- (defun write-xml (e s &key (indent nil)) "Renders a lisp node tree to an xml stream. Indents if indent is non-nil." (generate-xml e s (if indent 1 0))) (defun toxml (e &key (indent nil)) "Renders a lisp node tree to an xml string." (with-output-to-string (s) (write-xml e s :indent indent))) (defun parse (s &key (compress-whitespace t)) "Parses the supplied stream or string into a lisp node tree." (let ((*compress-whitespace* compress-whitespace) (stream (etypecase s (string (make-string-input-stream s)) (stream s)))) (handler-case (document (make-state :stream stream)) (end-of-file () nil) (xml-parse-error () nil)))) ;;(trace end-tag comment comment-or-doctype content name xmldecl misc) ;;(trace processing-instruction processing-instruction-or-xmldecl element start-tag ws element-val) #+(or sbcl cmu allegro) (defun test () ;;(sb-profile:profile "XMLS") #+cmu(extensions:gc-off) ;; too noisy (dolist (test (cdr #+sbcl sb-ext:*posix-argv* #+cmu (subseq extensions:*command-line-strings* 4) #+allegro (sys:command-line-arguments))) (if *test-verbose* (format t "~A~%" (toxml (parse (open test) :compress-whitespace t) :indent t)) (progn (format t "~40A" (concatenate 'string test "... ")) (if (parse (open test)) (format t "ok~%") (format t "failed!~%"))))) ;;(sb-profile:report) #+sbcl(sb-ext:quit) #+cmu(extensions:quit) #+allegro(excl:exit))