Anonymous avatar Anonymous committed 216c32d

Miscellaneous cleanups; added expected-next-symbols to parser

Comments (0)

Files changed (5)

 (in-package :hh-parse)
 
- #|
-Notation for extended grammar:
-
-(rule-name alt1 alt2 ...)
-
-where each term alt1/alt2 etc.s is a list of either symbols, literals, or lists with any of 
-the following as heads:
-
-+ repeat terms 1 or more times
-* repeat terms 0 or more times
-? optional terms: appear 0 or 1 times
-^ alternative terms : choose 1 term to appear
-
-also: 
-
-() around terms represents a logical grouping (that is, if the head
-  is none of the above symbols)
-
-All of these reduce to a more fundamental grammar (similar to CL-YACC) of:
-
-(rule-name alt1 alt2 ...)
-
-where 
-
-rule = symbol identifying name of rule
-alt1/alt2 = individual terms that are either names of other rules, or lists, where
-  each list is a sequence of other rule names
-
-Transforming the extended grammar into the fundamental grammar involves the following:
-
-* Treat the head of each rule as its name; set that aside
-* For each item in the tail of rule, treat each item as a discrete alternate right-hand side that is possible 
-  (e.g., a ;; rhs
-	 )
-* For any RHS that is not a list, make it a list
-* Convert () : For each RHS, walk the RHS and convert any list that does not have a recognized head (eg., see above)
-  into a reference to a new rule that has that list as a sequence in its single RHS
-* Convert * : For each occurence in a RHS of a list with * as the head, replace with (? (+ ...)) instead
-* Convert ? : For each occurrence of ? as the head of a list, convert the containing RHS into 2 separate RHS 
-  one with the rest of the list in place of the term containing ? and another without the ? terms at as
-  if it's not there
-* Convert + : For each occurrence of + , convert to a reference to a new rule that has 2 alternatives, one with
-  repetition, one without
-
-Terminology:
-
-* A grammar comprises one or more rules
-* A rule is a list whose car is a symbol for the rule name, and the cdr is a list of right-hand side alternatives
-* A right-hand side (or rhs) is a list, each element of which is either the symbolic name of a rule, a list
-  with one of the extended symbols above as it's head, or a literal (terminal)
-* A production is a logical idea only: it's the pairing of a rule name with 1 rhs (so a rule is just a
-  short-hand for a set of productions all with the same rule name)
-
-|#
-
 ;; ---------------------------------------------------------------------------------------------------------------------
 ;; Grammar specifications
 
 ;;
 
 (defmacro defgrammar (name start-rule-name &rest rules)
+  "Defines a grammar, later accessible by calling a function named for the grammar, where the specification for
+  the grammar consists of one or more rules of the form:
+
+  ;; (rule-name alt1 alt2 ...)
+
+  where each term alt1/alt2 etc.s is a list of either symbols, literals, or lists with any of 
+  the following as heads:
+
+  + repeat terms 1 or more times
+  * repeat terms 0 or more times
+  ? optional terms: appear 0 or 1 times
+  ^ alternative terms : choose 1 term to appear
+
+  also: 
+
+  any () around terms represents a logical grouping (that is, if the head
+  is none of the above symbols)
+
+  All of these reduce to a more fundamental grammar (similar to CL-YACC) of:
+
+  ;; (rule-name alt1 alt2 ...)
+
+  where 
+
+  rule = symbol identifying name of rule
+  alt1/alt2 = each is a sequence (list) of other rule names or tokens
+
+  Transforming the extended grammar into the fundamental grammar involves the following:
+
+  * Treat the head of each rule as its name; set that aside
+  * For each item in the tail of rule, treat each item as a discrete alternate right-hand side that is possible
+  * For any RHS that is not a list, make it a list
+  * Convert () : For each RHS, walk the RHS and convert any list that does not have a recognized head (eg., see above)
+    into a reference to a new rule that has that list as a sequence in its single RHS
+  * Convert * : For each occurence in a RHS of a list with * as the head, replace with (? (+ ...)) instead
+  * Convert ? : For each occurrence of ? as the head of a list, convert the containing RHS into 2 separate RHS 
+    one with the rest of the list in place of the term containing ? and another without the ? terms at as
+    if it's not there
+  * Convert + : For each occurrence of + , convert to a reference to a new rule that has 2 alternatives, one with
+    repetition, one without
+
+  Terminology:
+
+  * A grammar comprises one or more rules
+  * A rule is a list whose car is a symbol for the rule name, and the cdr is a list of right-hand side alternatives
+  * A right-hand side (or rhs) is a list, each element of which is either the symbolic name of a rule, a list
+    with one of the extended symbols above as it's head, or a literal (terminal)
+  * A production is a logical idea only: it's the pairing of a rule name with 1 rhs (so a rule is just a
+    short-hand for a set of productions all with the same rule name)
+"
   `(let* ((specification (transform-extended-grammar-to-fundamental ',start-rule-name ',rules))
 	  (grammar (make-grammar specification)))
      (defun ,name ()

package-hh-parse.lisp

 
    ;; parsers
    #:make-parser
+   #:expected-next-symbols
    #:parse-token
    #:parse-input
    #:get-parse-result
    (lexer :initarg :lexer :accessor lexer)
    (stack :initform () :accessor stack)))
 
+(defun expected-next-symbols (parser)
+  "Given a parser's current state, return a list of valid symbols (terminal and non-terminal) that would advance the parse"
+  (let ((state (caar (stack parser)))
+	(actions (entries (actions (grammar parser))))
+	(expected ()))
+    (loop for (expected-state expected-symbol) being the hash-key of actions
+	 if (= state expected-state ) 
+	 do (push expected-symbol expected))
+    expected))
+
 (defun parse-token (parser token)
   "Advance the state of the parser by parsing a single token; does not assume token came from lexer"
   (let ((grammar (grammar parser)))
   (let ((lexer (lexer parser)))
     (loop for result = (parse-token parser (next-token lexer))
        while (equal :continue result)
-       finally (return (values result (get-parse-result parser))))))
+       finally (return (values result (get-parse-result parser))))))
+
+(defpackage :hh-parse-profile
+  (:use :cl :asdf :lisp-unit :hh-utils :hh-parse))
+
+(in-package :hh-parse-profile)
+
 ;; Note this file is not presently used by the package, but is included for 
 
 (sb-profile:profile "HH-PARSE")
 
 ;; TODO: Need to rethink how we profile, since grammars are now constructed at load time,
 ;; and that was the most expensive part of this code
-(let* ((grammar (html-grammar)) 
-       (source (make-instance 'source-code-file))
+
+(defgrammar html-grammar document
+  ( tag-name  (identifier) )
+  
+  ( attribute-name  (identifier) )
+
+  (integer (digit)
+	   (integer digit))
+  (numeric-value ( (? (^ plus minus)) integer (? decimal integer)))
+  (number (numeric-value))
+
+  (es (? ws))
+
+  ( quantity-value  (^ number
+		       ( number pct-symbol)
+		       ( number pct)
+		       ( number px) ) )
+
+  ( attribute-value  (^ quantity-value
+			string-value))
+
+  ( attribute  (attribute-name es equal-sign es attribute-value))
+
+  ( attribute-list  ( attribute (* ws attribute)))
+
+  ( start-tag  (lt es tag-name (? ws attribute-list) es gt))
+
+  ( end-tag  (lt es forward-slash es tag-name  gt))
+
+  ( single-tag  (lt es tag-name (? ws attribute-list) es forward-slash es gt))
+
+  ( tag  (^ single-tag
+	    (start-tag es (* (^ tag html-text)) es end-tag)))
+  (document (tag)))
+
+(deflexer html-lexer (:text)
+  (:tag #'digit-char-p digit)
+  (:tag #'alpha-char-p identifier :accumulate #'(lambda (nc)
+						  (and nc
+						       (or (digit-char-p nc) 
+							   (alpha-char-p nc)
+							   (equal #\_ nc)))))
+  (:text #\< lt :state :tag)
+  (:tag #\> gt :state :text)
+  (:tag #\+ plus)
+  (:tag #\- minus)
+  (:tag #\. decimal)
+  (:tag #\/ forward-slash)
+  (:tag #\% percent-symbol)
+  (:tag #\= equal-sign)
+  (:tag #'whitespace-p ws :accumulate #'whitespace-p)
+  (:text t html-text :accumulate #'(lambda (nc) (char/= #\< nc))))
+
+
+(let* ((grammar (html-grammar))
+       (source (make-source "<foo bar=1>borp whaple</foo>"))
        (lexer (make-instance 'html-lexer :source source))
        (parser (make-parser lexer grammar))) 
-  (setf (source-text source) "<foo bar=1>borp whaple</foo>")
   (parse-token parser (next-token lexer)))
 
 (sb-profile:report)
 (defpackage :hh-parse-tests
-  (:use :cl :asdf :lisp-unit :hh-utils :hh-parse)
-   )
+  (:use :cl :asdf :lisp-unit :hh-utils :hh-parse))
 
 (in-package :hh-parse-tests)
 
   ( attribute-value  (^ quantity-value
 			string-value))
 
-  ( attribute  (attribute-name 
-		es 
-		equal-sign
-		es 
-		attribute-value))
+  ( attribute  (attribute-name es equal-sign es attribute-value))
 
   ( attribute-list  ( attribute (* ws attribute)))
 
-  ( start-tag  ( lt 
-		 es 
-		 tag-name
-		 (? ws attribute-list)
-		 es 
-		 gt))
+  ( start-tag  (lt es tag-name (? ws attribute-list) es gt))
 
-  ( end-tag  ( lt es forward-slash es tag-name  gt))
+  ( end-tag  (lt es forward-slash es tag-name  gt))
 
-  ( single-tag  ( lt 
-		  es 
-		  tag-name
-		  (? ws attribute-list) 
-		  es 
-		  forward-slash 
-		  es 
-		  gt))
+  ( single-tag  (lt es tag-name (? ws attribute-list) es forward-slash es gt))
 
   ( tag  (^ single-tag
-	    (start-tag 
-	     es 
-	     (* (^ tag html-text))
-	     es 
-	     end-tag)))
+	    (start-tag es (* (^ tag html-text)) es end-tag)))
   (document (tag)))
 
 (deflexer html-lexer (:text)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.