Commits

Anonymous committed 594adda

Functional html lexer; lots of cleanup still to do, so work in progress

  • Participants
  • Parent commits bf0c59c
  • Branches parsing

Comments (0)

Files changed (4)

File hh-parse.asd

   :components (
                (:file "package-hh-parse")
 	       (:file "grammar")
+	       (:file "lexer")
 	       (:file "parser")
 	       (:file "source")
 	       (:file "samples")
+(in-package :hh-parse)
+
+;; ---------------------------------------------------------------------------------------------------------------------
+;;
+;;  Classes + types
+;;
+;; ---------------------------------------------------------------------------------------------------------------------
+
+(defclass lexer ()
+  ((source :initarg :source :accessor source)
+   (position :initform (make-instance 'source-code-position) :initarg :position :accessor parse-position)
+   (state :initform :any :accessor state)))
+
+(define-condition lexer-error (error)
+  ((lexer :initarg :lexer :accessor lexer))
+  (:report (lambda (condition stream)
+	     (with-slots (source position state) (lexer condition)
+	       (format stream "Lexer in state ~a encountered unexpected characters before ~a at position ~a"
+		       state
+		       (current-parse-character source position)
+		       position)))))
+
+;; ---------------------------------------------------------------------------------------------------------------------
+;;
+;;  Generics
+;;
+;; ---------------------------------------------------------------------------------------------------------------------
+
+(defgeneric next-token (lexer)
+  (:documentation "Return the next token from the stream, or :eof if at end"))
+
+(defgeneric copy-lexer (lexer)
+  (:documentation "Return a copy of the lexer with independent state such that invocations of next-token
+   on the original and the copy progress independently"))
+
+;; ---------------------------------------------------------------------------------------------------------------------
+;;
+;;  Functions + methods + macros
+;;
+;; ---------------------------------------------------------------------------------------------------------------------
+
+;; (defmacro deflexer (name char-var &rest body)
+;;   `(progn
+;;      (defclass ,name (lexer))
+
+;;      (defmethod next-token ((lexer ,name))
+;;        (with-slots (source position state)
+;; 	   (let ((,char-var (next-character source position state)))
+	     
+;; 	     ,@body)))))
+
+(defmethod copy-lexer ((lexer lexer))
+  (with-slots (source position state value) lexer
+    (make-instance (class-of lexer) :source source :position position :state state)))
+

File samples.lisp

 
 						    ( attribute-name  (identifier) )
 
-						    ( quantity-value  (^ number-value
-									 ( number-value pct-symbol)
-									 ( number-value pct)
-									 ( number-value px) ) )
+						    (integer (digit)
+							     (integer digit))
+						    (numeric-value ( (? (^ plus minus)) integer (? decimal integer)))
+						    (number (numeric-value))
+
+						    ( quantity-value  (^ number
+									 ( number pct-symbol)
+									 ( number pct)
+									 ( number px) ) )
 
 						    ( attribute-value  (^ quantity-value
 									  string-value))
 								    tag-name
 								    (? ws attribute-list) 
 								    es 
-								    fs 
+								    forward-slash 
 								    es 
 								    gt))
 
   (defun html-grammar ()
     html-grammar))
 
+(defclass html-lexer (lexer) 
+  ((state :initform :text)))
+
+(defmethod next-token ((lexer html-lexer))
+  (with-slots (source position state) lexer
+    (let ((c (next-parse-character source position))
+	  (token-value (make-array `(0) :element-type 'character :adjustable t :fill-pointer t)))
+      (labels ((accumulate (c)
+		 (vector-push-extend c token-value))
+	       (accumulate-while (type test)
+		 (accumulate c)
+		 (loop for nc = (current-parse-character source position)
+		    while (and nc (funcall test nc))
+		    do (progn 
+			 (accumulate nc)
+			 (incf-parse-position source position))
+		    finally (return (list type token-value)))))
+	(when c
+	  ;; TODO factor in state into each test, and
+	  ;; arrange in an order that makes sense (esp. for text)
+	  (cond ((and (equal :tag state) (digit-char-p c)) 
+		 (list 'digit c))
+		((and (equal :tag state) (alpha-char-p c))
+		 (accumulate-while 'identifier #'(lambda (nc)
+						   (and nc
+							(or (digit-char-p nc) 
+							    (alpha-char-p nc)
+							    (equal #\_ nc))))))
+		((and (equal :text state) (char= #\< c))
+		 (setf state :tag)
+		 (list 'lt c))
+		((and (equal :tag state) (char= #\> c))
+		 (setf state :text)
+		 (list 'gt c))
+		((and (equal :tag state) (char= #\+ c))
+		 (list 'plus c))
+		((and (equal :tag state) (char= #\- c))
+		 (list 'minus c))
+		((and (equal :tag state) (char= #\. c))
+		 (list 'decimal c))
+		((and (equal :tag state) (char= #\/ c))
+		 (list 'forward-slash c))
+		((and (equal :tag state) (char= #\% c))
+		 (list 'percent-symbol c))
+		((and (equal :tag state) (char= #\= c))
+		 (list 'equal-sign c))
+		((and (equal :tag state) (whitespace-p c))
+		 (accumulate-while 'ws #'whitespace-p))
+		((equal :text state)
+		 (accumulate-while 'html-text #'(lambda (nc) (char/= #\< nc))))
+		(t (error 'lexer-error :lexer lexer))
+		))))))
 
 (defgeneric parse-position ( code ) )
 
-(defgeneric (setf parse-position ) ( position code ) )
+(defgeneric (setf parse-position) ( position code ) )
+
+(defgeneric current-parse-character ( source position )
+  (:documentation "Return the character at the current position in the source"))
+
+(defgeneric incf-parse-position (source position)
+  (:documentation "Increment the position to the next character in source, adjusting for line breaks"))
+
+(defgeneric next-parse-character ( source position)
+  (:documentation "Return the current character, and advance the position in the source"))
 
 (defgeneric next-column ( position )
   (:documentation "Increment the column component of position"))
 ;; =====================================================================================================================
 
 (defmethod (setf current-position) :around ( (source source-code-file) (position source-code-position) )
-  ;; validate that the line and column are in bounds (<= # of lines, <= # of columns in line)
+  ;; TODO validate that the line and column are in bounds (<= # of lines, <= # of columns in line)
   (let ((new-position (call-next-method) ))
     new-position))
 
   (setf (current-line source) (make-instance 'source-code-line :text line) ))
 
 (defmethod current-line-position ( (source source-code-file) )
-  (line-at (current-position source) ))
+  (current-line-position (current-position source)))
 
-(defmethod (setf current-line-position) ( line source)
-  (setf (line-at (current-position source) ) line))
+(defmethod current-line-position ( (position source-code-position))
+  (line-at position))
+
+(defmethod (setf current-line-position) ( line (source source-code-file))
+  (setf (current-line-position (current-position source)) line))
+
+(defmethod (setf current-line-position) ( line (position source-code-position))
+  (setf (line-at position ) line))
 
 (defmethod current-column-position ( (source source-code-file) )
-  (column-at (current-position source) ))
+  (current-column-position (current-position source)))
 
-(defmethod (setf current-column-position) ( column source)
-  (setf (column-at (current-position source) ) column))
+(defmethod current-column-position ( (position source-code-position) )
+  (column-at position))
+
+(defmethod (setf current-column-position) ( column (source source-code-file))
+  (setf (current-column-position (current-position source) ) column))
+
+(defmethod (setf current-column-position) ( column (position source-code-position))
+  (setf (column-at position ) column))
 
 (defmethod next-column ( (position source-code-position) )
   (incf (column-at position) ))
        (equal (column-at left) (column-at right))))
 
 (defmethod parse-position ( (code source-code-file) )
-  (current-position code))
+  "Return a copy of the current position"
+  (make-instance 'source-code-position 
+		 :column (current-column-position code)
+		 :line (current-line-position code)))
 
-(defmethod (setf parse-position ) ( position (code source-code-file) )
-  (setf (current-position code) position))
+(defmethod (setf parse-position) ( position (code source-code-file) )
+  "Adjust the current position; do not retain the provided position object"
+  (setf (current-column-position code) (column-at position))
+  (setf (current-line-position code) (line-at position)))
+
+(defmethod current-parse-character ( (source source-code-file) (position source-code-position))
+  (let ((line-pos (current-line-position position)))
+    ;; TODO we're only checking line position right now,
+    ;; because if using next-character, then should never
+    ;; have column-position of end of the current-line.
+    ;; Of course, since it is possible to explicitly set
+    ;; the column position then the assumption we make
+    ;; here may not be safe
+    (when (< line-pos (length (lines-of source)))   
+      (let* ((line (elt (lines-of source) (current-line-position position)))
+	     (text (text-of line))
+	     (pos (current-column-position position)))
+	(elt text pos)))))
+
+(defmethod incf-parse-position ( (source source-code-file) (position source-code-position) )
+  (let ((line-pos (current-line-position position))
+	(column-pos (current-column-position position))
+	(line (elt (lines-of source) (current-line-position position))))
+    (if (< (1+ column-pos) (length (text-of line)))
+	;; next column on same line
+	(setf (current-column-position position) (1+ column-pos))
+	;; go to start of next line
+	(progn
+	  (setf (current-line-position position) (1+ line-pos))
+	  (setf (current-column-position position) 0)))))
+
+(defmethod next-parse-character ( (source source-code-file) (position source-code-position))
+  (let ((c (current-parse-character source position)))
+    (when c
+      ;; only increment if wasn't already at end
+      (incf-parse-position source position))
+    c))
 
 (defmethod flush-parsers ( (source source-code-file) &optional (line nil) )
   (loop for i from (or line (current-line-position source) ) below (length (lines-of source) )