Commits

Devin Jeanpierre  committed 2a4f816

First step towards JITing: extension module-ing.

Writing a JIT involves interop with LLVM, which means writing C++ code.
Therefore, as a trial run, I'll write an interpreter in C first.

The only thing left to do is support bytes that happen to exist as
unicode. How awful!

  • Participants
  • Parent commits 346739e
  • Branches jit

Comments (0)

Files changed (8)

+syntax: glob
+src/dfa.c # autogenerated by dfa.pxd

File replay/dfa.py

         self.antiset_transitions = antiset_transitions or {}
         
         self._validate()
-        self.states = self._list_states()
+        self._states = list(self._list_states())
+        self.states = set(self._states) # FIXME: necessary?
         self.implicit_alphabet = self._generate_implicit_alphabet()
         self.ordered_implicit_alphabet = sorted(self.implicit_alphabet)
     
                     "were also antiset-transitions: {%s}"
                     % (q, ', '.join(dup_keys)))
     
-    @util.builder(set)
+    @util.builder(util.unique)
     def _list_states(self):
         yield self.start
         for q in self.accepting: yield q

File replay/tests/test_c_dfa.py

+"""Some simple tests to establish behavior of C implementation of DFA on bytes.
+
+The Python compatibility test suite runs with the C implementation too, so,
+no worries there.
+"""
+import unittest
+
+from replay import dfa, _dfa
+
+class TestInterpreted(unittest.TestCase):
+    def test_bytes_spec(self):
+        """This is the easiest thing: work on bytes"""
+        d = dfa.DFA(0, {0: {u'a': 1}}, {0, 1})
+
+        c_d = _dfa._DFA(d)
+        self.assertTrue(c_d.run(b'a'))
+
+    def test_bytes_spec(self):
+        """This is harder: work on unicode, if we ask it."""
+        d = dfa.DFA(0, {0: {u'a': 1}}, {0, 1})
+
+        c_d = _dfa._DFA(d)
+        self.assertTrue(c_d.run_unicode('a'))

File replay/util.py

         return g
     return decorator
 
+def unique(stuff):
+    seen = set()
+    for x in stuff:
+        if x not in seen:
+            seen.add(x)
+            yield x
+
 def appender(somelist):
     """Add to a container. Useful for maintaining a registry"""
     _append = somelist.append
+#!/usr/bin/env python
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+
+TROVE_CLASSIFIERS = [ # lazy
+]
+
+dfa_extension = Extension(
+    'replay._dfa',
+    sources=[
+        'src/dfa.pyx',
+        #'src/dfa.pxd',
+        #'src/dfa.h',
+        'src/run_dfa.c',])
+
+setup(
+    name='replay',
+    version='0.1a', # figure out a scheme later, until then we are 0.1a
+    cmdclass = {'build_ext': build_ext},
+    ext_modules = [dfa_extension],
+    description='Regular expression toolkit',
+    author='Devin Jeanpierre',
+    author_email='jeanpierreda@gmail.com',
+    packages=[
+        'replay',
+        'replay.tests'],
+    url='https://bitbucket.org/devin.jeanpierre/replay/',
+    classifiers=TROVE_CLASSIFIERS,
+    license='MIT',)
+typedef struct bytes_dfa bytes_dfa;
+
+struct bytes_dfa {
+    int accepting;
+    bytes_dfa* transitions[256];
+};
+from libc.stdlib cimport malloc, free
+
+cdef struct bytes_dfa:
+    int accepting
+    bytes_dfa* transitions[256]
+ctypedef bytes_dfa BytesDfa # ew.
+
+cdef extern int c_run_bytes_dfa "run_bytes_dfa" (BytesDfa* dfa, char* string, size_t n)
+
+cdef class _DFA:
+    cdef BytesDfa* machine
+    def __cinit__(self, py_dfa):
+        self.machine = py_to_c_dfa(py_dfa)
+
+    def __dealloc__(self):
+        free(self.machine)
+
+    def run(self, string):
+        return bool(c_run_bytes_dfa(self.machine, string, len(string)))
+
+cdef BytesDfa* py_to_c_dfa(py_dfa):
+    cdef long x
+    cdef BytesDfa* c_nodes
+    cdef size_t next_i
+    n = len(py_dfa._states)
+    # +1 because we add an explicit dead state
+    c_nodes = <BytesDfa*>malloc((n+1)*sizeof(BytesDfa))
+
+    # just to speed things up a bit
+    state_index = {state:i for i, state in enumerate(py_dfa._states)}
+
+    cdef size_t i, j
+    for i in range(n):
+        c_node = c_nodes + i
+        py_state = py_dfa._states[i]
+
+        c_node.accepting = py_state in py_dfa.accepting
+        for j in range(256):
+            next_state = py_dfa.get_next(py_state, chr(j))
+            if next_state is None:
+                # dead state
+                next_i = n
+            else:
+                next_i = state_index[py_state]
+
+            c_node.transitions[j] = c_nodes + next_i
+
+    # this is safe, because the first state is always the start state.
+    # (so this can pretend to be a pointer to the start of the DFA, and freeing
+    #  this will free the whole DFA)
+    return c_nodes

File src/run_dfa.c

+#include <stddef.h>
+#include "dfa.h"
+
+/** Run `dfa` on memory `string` of length `n`
+ * FIXME: add optional early backout/fullmatch
+ */
+int run_bytes_dfa(bytes_dfa* dfa, char* string, size_t n) {
+    while (n) {
+        dfa = dfa->transitions[(size_t)(*string)];
+        string++;
+        n--;
+    }
+    return dfa->accepting;
+}