Source

HH-Parse / lexer.lisp

(in-package :hh-parse)

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Conditions
;;
;; ---------------------------------------------------------------------------------------------------------------------

(define-condition lexer-error (error)
  ((lexer :initarg :lexer :accessor lexer)
   (unexpected-character :initform nil :initarg :character :accessor unexpected-character))
  (:report (lambda (condition stream)
	     (with-slots (source position state) (lexer condition)
	       (format stream "Unexpected character ~@[~s~] in state ~s at position (line=~a,column=~a)"
		       (unexpected-character condition)
		       state
		       (line-at position)
		       (column-at position))))))

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Generics
;;
;; ---------------------------------------------------------------------------------------------------------------------

(defgeneric next-token (lexer)
  (:documentation "Return the next token from the stream, or :eof if at end"))

(defgeneric copy-lexer (lexer)
  (:documentation "Return a copy of the lexer with independent state such that invocations of next-token
   on the original and the copy progress independently"))

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Functions + methods + macros
;;
;; ---------------------------------------------------------------------------------------------------------------------

;; Source code helpers
(defmethod source-text ((lexer lexer))
  (source-text (source lexer)))

(defmethod (setf source-text) (text (lexer lexer) )
  ;; Drop existing lines
  (setf (source-text (source lexer)) text))

(defmacro deflexer (name (&optional (initial-state nil)) &rest token-definitions)
  "Define a lexer whose class is the provided name, the initial state of the lexer is initial,
and token definitions are a set of rules defining the tokens recognized by the lexer.  Token definitions should be
in the following form:

  ;; (state character-test token-type &key ((:state next-state) nil) ((:accumulate accumulation-test) nil))

where forms WITH an accumulation test expand to:

  ;; ((and (equal ',state state) (funcall ,character-test c))
  ;;  (return-accumulated-token ',token-type ,accumulation-test))

and forms WITHOUT an accumulation test expand to:

 ;; ((and (equal ',state state) (funcall ,character-test c))
 ;;  (return-token ',token-type))

In both cases, if next-state is non-nil, then the expanded token rule will cause
the lexer to change state after recognizing the token.

One helpful feature (for some token types) is that if t is passed for character-test,
then the test matches any character--no need to pass in a full lambda just to always return t.

An additional helpful feature is that if character-test is a character constant
and not a function, then the character-test will be a simple lambda:

  ;; (lambda (c) (char= ,character-test c))

For character tests, the single argument passed into the test is the character
just read from the lexer's source.  For accumulation tests, the single argument
passed to the test is a lookahead: that is, if the accumulation test returns true, then this
character will be read from the lexer stream and consumed; if the accumulation
test returns false, the character is not consumed and accumulation of a token
completes. 


"
  (let ((token-rules 
	 (loop for token in token-definitions
	    collect (destructuring-bind (state character-test token-type &key ((:state next-state) nil) ((:accumulate accumulation-test) nil)) token
		      (let ((actual-character-test (cond ((equal t character-test)
							  t)
							 ((functionp character-test) `(funcall ,character-test c))
							 ((and (listp character-test) (equal 'lambda (car character-test)))
							  `(funcall ,character-test c))
							 ((and (listp character-test) (equal 'function (car character-test)))
							  `(funcall ,character-test c))
							 ((characterp character-test)
							  `(char= c ,character-test))
							 (t (error "Bad character test ~s in token definition: ~s~%" character-test token)))))
			(if accumulation-test
			    ;; with accumulation test
			    `((and (equal ,state state) ,actual-character-test)
			      ,(when next-state `(setf state ,next-state))
			      (return-accumulated-token ',token-type ,accumulation-test))

			    ;; without accumulation test
			    `((and (equal ,state state) ,actual-character-test)
			      ,(when next-state `(setf state ,next-state))
			      (return-token ',token-type))))))))
    `(progn
       (defclass ,name (lexer)
	 ((state :initform ,initial-state)))

       (defmethod next-token ((lexer ,name))
	 (with-slots (source position state) lexer
	   (let ((c (next-lex-character source position))
		 (token-value (make-array `(0) :element-type 'character :adjustable t :fill-pointer t)))
	     (labels ((accumulate (c)
			(vector-push-extend c token-value))
		      (return-accumulated-token (type test)
			(accumulate c)
			(loop for nc = (current-lex-character source position)
			   while (and nc (funcall test nc))
			   do (progn 
				(accumulate nc)
				(incf-lex-position source position))
			   finally (return (list type token-value))))
		      (return-token (type)
			(list type c)))
	       (when c
		 (cond ,@token-rules)))))))))

(defmethod copy-lexer ((lexer lexer))
  (with-slots (source position state value) lexer
    (make-instance (class-of lexer) :source source :position position :state state)))