Commits

Robert Smith committed f3924f5

Add the first iteration of a Thompson-style regular expression matcher.

Comments (0)

Files changed (1)

thompson-nfa.lisp

+;;;; thompson-nfa.lisp
+;;;; Copyright (c) 2013 Robert Smith
+
+;;;; Example use:
+;;;;
+;;;; CL-USER> (regex-match-p `(:concat (:kleene (:either #\a #\b))
+;;;;                                   (:option #\c))
+;;;;                         "abababbbaabababbabababbabababac")
+;;;; T
+
+
+;;; Regular Expression Grammar:
+;;;
+;;; <regex> := <char>
+;;;          | (:CONCAT <regex> <regex>)
+;;;          | (:KLEENE <regex>)
+;;;          | (:REPEAT <regex>
+;;;          | (:OPTION <regex>)
+;;;          | (:EITHER <regex> <regex>)
+
+(defun pre->post (regex)
+  (let ((postfix nil))
+    (labels ((emit (x)
+               (push x postfix))
+             
+             (generate (regex)
+               (if (characterp regex)
+                   (emit regex)
+                   (let ((length (length regex)))
+                     (cond
+                       ((= 2 length) (progn
+                                       (generate (second regex))
+                                       (emit (first regex))))
+                       ((= 3 length) (progn
+                                       (generate (second regex))
+                                       (generate (third regex))
+                                       (emit (first regex))))
+                       (t (error "invalid regex ~S" regex)))))))
+      (generate regex)
+      (nreverse postfix))))
+
+;;; State
+
+(defvar *state-counter* 0)
+
+(defstruct state
+  (id (incf *state-counter*)))
+
+(defstruct (wire (:include state))
+  char
+  out)
+
+(defstruct (junction (:include state))
+  left
+  right)
+
+(defstruct (terminal (:include state)))
+
+(defvar *terminal* (make-terminal))
+
+(defconstant +detached+ :detached
+  "Denotes an arrow in the NFA pointing to nowhere.")
+
+;;; Fragment
+;;;
+;;; A fragment holds a state and any arrows that need to be connected.
+
+(defstruct fragment
+  state
+  arrows)
+
+(defun patch (arrows state)
+  "Connect all of the arrows ARROWS to the state STATE."
+  (dolist (arrow arrows)
+    (funcall arrow state)))
+
+;;; NFA Construction
+
+(defun make-nfa (instrs)
+  (let ((frags nil))
+    (labels ((push-frag (frag)
+               (push frag frags))
+             
+             (pop-frag ()
+               (pop frags))
+             
+             (process (instr)
+               (if (characterp instr)
+                   (let ((state (make-wire :char instr
+                                           :out +detached+)))
+                     (push-frag (make-fragment
+                                 :state state
+                                 :arrows (list (lambda (st)
+                                                 (setf (wire-out state) st))))))
+                   (case instr
+                     ((:CONCAT)
+                      (let* ((e2 (pop-frag))
+                             (e1 (pop-frag)))
+                        (patch (fragment-arrows e1)
+                               (fragment-state e2))
+                        (push-frag (make-fragment
+                                    :state (fragment-state e1)
+                                    :arrows (fragment-arrows e2)))))
+                     
+                     ((:KLEENE)
+                      (let* ((e (pop-frag))
+                             (s (make-junction
+                                 :left (fragment-state e)
+                                 :right +detached+)))
+                        (patch (fragment-arrows e)
+                               s)
+                        (push-frag
+                         (make-fragment
+                          :state s
+                          :arrows (list (lambda (st)
+                                          (setf (junction-right s) st)))))))
+                     
+                     ((:REPEAT)
+                      (let* ((e (pop-frag))
+                             (s (make-junction
+                                 :left (fragment-state e)
+                                 :right +detached+)))
+                        (patch (fragment-arrows e)
+                               s)
+                        (push-frag
+                         (make-fragment
+                          :state (fragment-state e)
+                          :arrows (list (lambda (st)
+                                          (setf (junction-right s) st)))))))
+                     
+                     ((:OPTION)
+                      (let* ((e (pop-frag))
+                             (s (make-junction
+                                 :left (fragment-state e)
+                                 :right +detached+)))
+                        (push-frag
+                         (make-fragment
+                          :state s
+                          :arrows (append
+                                   (fragment-arrows e)
+                                   (list (lambda (st)
+                                           (setf (junction-right s) st))))))))
+                     
+                     ((:EITHER)
+                      (let* ((e2 (pop-frag))
+                             (e1 (pop-frag))
+                             (s (make-junction
+                                 :left (fragment-state e1)
+                                 :right (fragment-state e2))))
+                        (push-frag (make-fragment
+                                    :state s
+                                    :arrows (append (fragment-arrows e1)
+                                                    (fragment-arrows e2))))))
+                     
+                     (otherwise (error "invalid instr ~S" instr))))))
+      (dolist (instr instrs)
+        (process instr))
+      
+      (assert (= 1 (length frags)))
+      
+      (let ((final (pop-frag)))
+        (patch (fragment-arrows final)
+               *terminal*)
+        
+        (fragment-state final)))))
+
+;;; NFA Simulation
+
+(defun simulate-nfa (state string)
+  (let ((current-states nil)
+        (next-states    nil))
+    (labels ((push-state (state)
+               ;;(format t "pushing state: ~S~%" state)
+               (cond
+                 ((eql state +detached+)
+                  (warn "Found detached arrow..."))
+                 
+                 ((junction-p state)
+                  (progn
+                    (push-state (junction-left state))
+                    (push-state (junction-right state))))
+                 
+                 (t (pushnew state next-states :key #'state-id
+                                               :test #'=))))
+             
+             (compute-next-states (char)
+               (dolist (state current-states)
+                 ;;(format t "looking at: ~S~%" state)
+                 (when (and (wire-p state)
+                            (char= char
+                                   (wire-char state)))
+                   (push-state (wire-out state)))))
+             
+             (rotate-states ()
+               (shiftf current-states
+                          next-states
+                          nil))
+             
+             (match-found-p (states)
+               ;; XXX: This does a simple pointer check. This could
+               ;; break if copying occurs!
+               (and (find *terminal* states :test #'eq)
+                    t)))
+      
+      ;; We use this hack so we do not need two PUSH-STATE functions.
+      (push-state state)
+      (rotate-states)
+      
+      (loop :for c :across string
+            :do (progn
+                  (compute-next-states c)
+                  (rotate-states))
+            :finally (return (match-found-p current-states))))))
+
+(defun regex-match-p (regex string)
+  "Does the string STRING match the regular expression REGEX (in
+  S-expression form)?"
+  (simulate-nfa (make-nfa (pre->post regex))
+                string))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.