Source

HH-Parse / lexer.lisp

Full commit
;; Copyright (c) 2010 Phil Hargett

;; Permission is hereby granted, free of charge, to any person obtaining a copy
;; of this software and associated documentation files (the "Software"), to deal
;; in the Software without restriction, including without limitation the rights
;; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
;; copies of the Software, and to permit persons to whom the Software is
;; furnished to do so, subject to the following conditions:

;; The above copyright notice and this permission notice shall be included in
;; all copies or substantial portions of the Software.

;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
;; THE SOFTWARE.

(in-package :hh-parse)

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Conditions
;;
;; ---------------------------------------------------------------------------------------------------------------------

(define-condition lexer-error (error)
  ((lexer :initarg :lexer :accessor lexer)
   (unexpected-character :initform nil :initarg :character :accessor unexpected-character))
  (:report (lambda (condition stream)
	     (with-slots (source position state) (lexer condition)
	       (format stream "Unexpected character ~@[~s~] in state ~s at position (line=~a,column=~a)"
		       (unexpected-character condition)
		       state
		       (line-at position)
		       (column-at position))))))

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Generics
;;
;; ---------------------------------------------------------------------------------------------------------------------

(defgeneric next-token (lexer)
  (:documentation "Return the next token from the stream, or :eof if at end"))

(defgeneric copy-lexer (lexer)
  (:documentation "Return a copy of the lexer with independent state such that invocations of next-token
   on the original and the copy progress independently"))

;; ---------------------------------------------------------------------------------------------------------------------
;;
;;  Functions + methods + macros
;;
;; ---------------------------------------------------------------------------------------------------------------------

;; Source code helpers
(defmethod source-text ((lexer lexer))
  (source-text (source lexer)))

(defmethod (setf source-text) (text (lexer lexer) )
  ;; Drop existing lines
  (setf (source-text (source lexer)) text))

(defmacro deflexer (name (&optional (initial-state nil)) &rest token-definitions)
  "Define a lexer whose class is the provided name, the initial state of the lexer is initial,
and token definitions are a set of rules defining the tokens recognized by the lexer.  Token definitions should be
in the following form:

  ;; (state character-test token-type &key ((:state next-state) nil) ((:accumulate accumulation-test) nil))

where forms WITH an accumulation test expand to:

  ;; ((and (equal ',state state) (funcall ,character-test c))
  ;;  (return-accumulated-token ',token-type ,accumulation-test))

and forms WITHOUT an accumulation test expand to:

 ;; ((and (equal ',state state) (funcall ,character-test c))
 ;;  (return-token ',token-type))

In both cases, if next-state is non-nil, then the expanded token rule will cause
the lexer to change state after recognizing the token.

One helpful feature (for some token types) is that if t is passed for character-test,
then the test matches any character--no need to pass in a full lambda just to always return t.

An additional helpful feature is that if character-test is a character constant
and not a function, then the character-test will be a simple lambda:

  ;; (lambda (c) (char= ,character-test c))

For character tests, the single argument passed into the test is the character
just read from the lexer's source.  For accumulation tests, the single argument
passed to the test is a lookahead: that is, if the accumulation test returns true, then this
character will be read from the lexer stream and consumed; if the accumulation
test returns false, the character is not consumed and accumulation of a token
completes. 


"
  (let ((token-nodes (loop for token in token-definitions
			collect (destructuring-bind (state character-test token-type &key ((:state next-state) nil) ((:accumulate accumulation-test) nil)) token
				  (declare (ignorable state character-test next-state accumulation-test))
				  `(defclass ,token-type (ast-node) ()))))
	(token-rules 
	 (loop for token in token-definitions
	    collect (destructuring-bind (state character-test token-type &key ((:state next-state) nil) ((:accumulate accumulation-test) nil)) token
		      (let ((actual-character-test (cond ((equal t character-test)
							  t)
							 ((functionp character-test) `(funcall ,character-test c))
							 ((and (listp character-test) (equal 'lambda (car character-test)))
							  `(funcall ,character-test c))
							 ((and (listp character-test) (equal 'function (car character-test)))
							  `(funcall ,character-test c))
							 ((characterp character-test)
							  `(char= c ,character-test))
							 (t (error "Bad character test ~s in token definition: ~s~%" character-test token)))))
			(if accumulation-test
			    ;; with accumulation test
			    `((and (equal ,state state) ,actual-character-test)
			      ,(when next-state `(setf state ,next-state))
			      (return-accumulated-token ',token-type ,accumulation-test))

			    ;; without accumulation test
			    `((and (equal ,state state) ,actual-character-test)
			      ,(when next-state `(setf state ,next-state))
			      (return-token ',token-type))))))))
    `(progn

       ,@token-nodes

       (defclass ,name (lexer)
	 ((state :initform ,initial-state)))

       (defmethod next-token ((lexer ,name))
	 (with-slots (source position state) lexer
	   (let ((c (next-lex-character source position))
		 (token-value (make-array `(0) :element-type 'character :adjustable t :fill-pointer t)))
	     (labels ((accumulate (c)
			(vector-push-extend c token-value))
		      (return-accumulated-token (type test)
			(accumulate c)
			(loop for nc = (current-lex-character source position)
			   while (and nc (funcall test nc))
			   do (progn 
				(accumulate nc)
				(incf-lex-position source position))
			   finally (return (list type (make-instance type :value token-value)))))
		      (return-token (type)
			(list type (make-instance type :value c))))
	       (when c
		 (cond ,@token-rules)))))))))

(defmethod copy-lexer ((lexer lexer))
  (with-slots (source position state value) lexer
    (make-instance (class-of lexer) :source source :position position :state state)))