Commits

Kirill Simonov committed 9b3db74 Draft

A simple parser generator.

Files changed (7)

.hgignore

+syntax: glob
+*.pyc
+*.pyo
+.*.sw?
+*.egg-info
+build
+dist
+sandbox
+README.html
LICENSE

+Copyright (c) 2012 Kirill Simonov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
README

+****************************************
+  ppparse -- a simple parser generator
+****************************************
+
+The ``ppparse`` package generates a lexical scanner and a syntax parser for the
+given grammar.  Use it as in the following example::
+
+    from ppparse import make_scan, make_parse
+
+    scan = make_scan(r"""
+        SPACE   : [\s]+
+                !skip
+        NUMBER  : [\d]+
+        SYMBOL  : [()*/+-]
+                !symbol
+    """)
+
+    parse = make_parse("""
+        expr    : term
+                | expr `+` term
+                | expr `-` term
+        term    : factor
+                | term `*` factor
+                | term `/` factor
+        factor  : atom
+                | `+` factor
+                | `-` factor
+        atom    : NUMBER
+                | `(` expr `)`
+    """)
+
+    text = "(-4+15)*4 - 2"
+    tokens = scan(text)
+    syntax = parse(tokens)
+
+The function ``scan()`` generated by ``make_scan()`` takes a string or an open
+file as input and produces a list of ``Token`` objects.  The function
+``parse()`` generated by ``make_parse()`` takes a list of ``Token`` objects and
+produces a syntax tree composed of ``Syntax`` and ``Token`` objects.
+
+A ``Token`` instance has the following attributes:
+
+``code``: ``unicode``
+    The token type; usually coincides with the name of the lexical rule
+    that matched the token (for ``!symbol`` rules, it is the token text).
+``text``: ``unicode``
+    The token value.
+
+Each ``Syntax`` instance has the following attributes:
+
+``code``: ``unicode``
+    The type of the syntax node; coincides with the name of the rule which
+    produced the node.
+``arms``: list of ``Syntax`` and ``Token`` objects
+    Nodes which matched the production rule.
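+
+For instance, a small evaluator for the arithmetic grammar above can walk the
+tree through these attributes (a sketch relying only on the documented
+``code``, ``text`` and ``arms``)::
+
+    def evaluate(node):
+        if not hasattr(node, 'arms'):               # Token leaf: a NUMBER
+            return int(node.text)
+        if len(node.arms) == 1:                     # expr: term, term: factor, ...
+            return evaluate(node.arms[0])
+        if node.code == 'atom':                     # `(` expr `)`
+            return evaluate(node.arms[1])
+        if node.code == 'factor':                   # unary `+` / `-`
+            sign, arm = node.arms
+            return -evaluate(arm) if sign.text == '-' else evaluate(arm)
+        left, op, right = node.arms                 # binary expr / term
+        return {'+': lambda a, b: a+b, '-': lambda a, b: a-b,
+                '*': lambda a, b: a*b, '/': lambda a, b: a/b}[op.text](
+                evaluate(left), evaluate(right))
+
+    print evaluate(syntax)                          # 42 for "(-4+15)*4 - 2"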
+
+

demo/demo_arith.py

+
+from ppparse import make_scan, make_parse
+
+LEX = r"""
+SPACE:  [\s]+
+        !skip
+NUMBER: [\d]+
+SYMBOL: [()*/+-]
+        !symbol
+"""
+
+SYN = """
+expr    : term
+        | expr `+` term
+        | expr `-` term
+term    : factor
+        | term `*` factor
+        | term `/` factor
+factor  : atom
+        | `+` factor
+        | `-` factor
+atom    : NUMBER
+        | `(` expr `)`
+"""
+
+EXAMPLES = [
+        "2+2",
+        "-34",
+        "(-3+7)*4-1"
+]
+
+scan = make_scan(LEX)
+parse = make_parse(SYN)
+
+for text in EXAMPLES:
+    print "="*79
+    print "INPUT:", text
+    print
+    print "TOKENS:"
+    tokens = scan(text)
+    for token in tokens:
+        print token
+    print
+    print "SYNTAX:"
+    syntax = parse(tokens)
+    print syntax
+    print
+

demo/demo_json.py

+
+from ppparse import make_scan, make_parse
+
+LEX = r"""
+$int        : [-]? ( [0] | [1-9] [0-9]* )
+$frac       : [.] [0-9]+
+$exp        : [eE] [+-] [0-9]+
+
+$char       : [^\0-\x1F"\\]
+$escape     : [\\] ["\\/bfnrt]
+$uescape    : [\\] [u] [0-9A-Fa-f]{4}
+
+NULL        : 'null'
+TRUE        : 'true'
+FALSE       : 'false'
+NUMBER      : $int $frac? $exp?
+STRING      : ["] ( $char | $escape | $uescape )* ["]
+SYMBOL      : [{}\[\]:,]
+            !symbol
+WHITESPACE  : [ \t\f\r\n]+
+            !skip
+"""
+
+SYN = """
+$pair       : STRING `:` value
+$members    : $pair ( `,` $pair )*
+$elements   : value ( `,` value )*
+
+value       : object    | array
+            | NUMBER    | STRING
+            | TRUE      | FALSE
+            | NULL
+object      : `{` $members? `}`
+array       : `[` $elements? `]`
+"""
+
+EXAMPLES = [
+        "true",
+        "[1, 2, 3, [4, 5, 6]]",
+        """ {"type": "number", "value": 1.01e-3, "units": null, "more": {} } """,
+]
+
+scan = make_scan(LEX)
+parse = make_parse(SYN)
+
+for text in EXAMPLES:
+    print "="*79
+    print "INPUT:", text
+    print
+    print "TOKENS:"
+    tokens = scan(text)
+    for token in tokens:
+        print token
+    print
+    print "SYNTAX:"
+    syntax = parse(tokens)
+    print syntax
+    print
+
setup.py

+#
+# Copyright (c) 2012 Kirill Simonov
+# Released under MIT license, see `LICENSE` for details.
+#
+
+
+from setuptools import setup
+
+
+NAME = "ppparse"
+VERSION = "0.0.1"
+DESCRIPTION = "A simple parser generator"
+AUTHOR = "Kirill Simonov"
+AUTHOR_EMAIL = "xi@resolvent.net"
+LICENSE = "MIT"
+PACKAGES = ['ppparse']
+PACKAGE_DIR = {'': 'src'}
+
+
+setup(name=NAME,
+      version=VERSION,
+      description=DESCRIPTION,
+      author=AUTHOR,
+      author_email=AUTHOR_EMAIL,
+      license=LICENSE,
+      packages=PACKAGES,
+      package_dir=PACKAGE_DIR)
+
+

src/ppparse/__init__.py

+#
+# Copyright (c) 2012 Kirill Simonov
+# Released under MIT license, see `LICENSE` for details.
+#
+
+
+from __future__ import unicode_literals
+import collections
+import urllib
+import re
+
+
+#
+# Error reporting.
+#
+
+
+class Mark(object):
+    """Input context."""
+
+    def __init__(self, src, text, pos):
+        self.src = src
+        self.text = text
+        self.pos = pos
+
+    def excerpt(self):
+        line_start = self.text.rfind("\n", 0, self.pos) + 1
+        line_end = self.text.find("\n", self.pos)
+        if line_end == -1:
+            line_end = len(self.text)
+        row = self.text.count("\n", 0, self.pos) + 1
+        col = self.pos - line_start + 1
+        line = self.text[line_start:line_end]
+        return "%s:%s:%s\n> %s\n>%s^" % (self.src, row, col, line.rstrip(), " "*col)
+
+    def __unicode__(self):
+        return "%s:%s" % (self.src, self.pos)
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return ("<%s %s>" % (self.__class__.__name__, self)).encode('utf-8')
+
+
+class Error(Exception):
+    """Parsing error with input context."""
+
+    def __init__(self, msg, mark):
+        self.msg = msg
+        self.mark = mark
+
+    def __unicode__(self):
+        return "%s at %s" % (self.msg, self.mark.excerpt())
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+
+#
+# Ordered mapping.
+#
+
+
+class omap(collections.MutableMapping):
+    """Ordered mapping.
+
+    Works almost like ``dict`` with the order of inserted keys preserved,
+    but ``iter(omap)`` generates *values* rather than keys.
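+
+    A small illustration (hedged; not a doctest run by the package)::
+
+        m = omap([('a', 1), ('b', 2)])
+        list(m)     # [1, 2] -- values, in insertion order
+        m.keys()    # ['a', 'b']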
+    """
+
+    def __init__(self, iterable=None):
+        self._keys = []
+        self._key_to_val = {}
+        if iterable is not None:
+            self.update(iterable)
+
+    def __eq__(self, other):
+        if not isinstance(other, omap):
+            return NotImplemented
+        return (self._keys == other._keys and self._key_to_val == other._key_to_val)
+
+    def __iter__(self):
+        for key in self._keys:
+            yield self._key_to_val[key]
+
+    def __len__(self):
+        return len(self._keys)
+
+    def __contains__(self, key):
+        return (key in self._key_to_val)
+
+    def __getitem__(self, key):
+        return self._key_to_val[key]
+
+    def iterkeys(self):
+        return iter(self._keys)
+
+    def itervalues(self):
+        for key in self._keys:
+            yield self._key_to_val[key]
+
+    def iteritems(self):
+        for key in self._keys:
+            yield (key, self._key_to_val[key])
+
+    def keys(self):
+        return list(self._keys)
+
+    def items(self):
+        return [(key, self._key_to_val[key]) for key in self._keys]
+
+    def values(self):
+        return [self._key_to_val[key] for key in self._keys]
+
+    def __setitem__(self, key, val):
+        if key not in self._key_to_val:
+            self._keys.append(key)
+        self._key_to_val[key] = val
+
+    def __delitem__(self, key):
+        # WARNING: O(N) behavior.
+        del self._key_to_val[key]
+        self._keys.remove(key)
+
+    def popitem(self):
+        key = self._keys.pop()
+        val = self._key_to_val[key]
+        del self._key_to_val[key]
+        return key, val
+
+    def clear(self):
+        self._keys = []
+        self._key_to_val = {}
+
+    def update(self, iterable):
+        if isinstance(iterable, collections.Mapping):
+            iterable = iterable.iteritems()
+        for key, val in iterable:
+            if key not in self._key_to_val:
+                self._keys.append(key)
+            self._key_to_val[key] = val
+
+
+#
+# Topological sorting.
+#
+
+
+def toposort(nodes, order):
+    """Topological sorting."""
+    ordered = []
+    visited = set()
+    active = []
+    stack = []
+    for node in nodes:
+        stack = [(node, None)]
+        while stack:
+            node, deps = stack.pop()
+            if node in visited:
+                continue
+            if deps is None:
+                if node in active:
+                    loop = active[active.index(node):]+[node]
+                    raise RuntimeError(loop)
+                active.append(node)
+                deps = order[node][:]
+            if deps:
+                dep = deps.pop()
+                stack.append((node, deps))
+                stack.append((dep, None))
+            else:
+                active.pop()
+                ordered.append(node)
+                visited.add(node)
+    return ordered
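+
+# For example (hedged; not executed on import):
+#
+#     toposort(['a', 'b', 'c'], {'a': ['b'], 'b': ['c'], 'c': []})
+#     # => ['c', 'b', 'a']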
+
+
+#
+# Regular expressions.
+#
+
+
+class Pattern(object):
+    """Regular expression pattern."""
+
+    def encode(self, nfa, src, dst):
+        # Encodes the pattern as a path from `src` to `dst` in `nfa`.
+        raise NotImplementedError()
+
+    def __unicode__(self):
+        raise NotImplementedError()
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return ("<%s %s>" % (self.__class__.__name__, self)).encode('utf-8')
+
+    def __call__(self):
+        # Converts the pattern to a DFA.
+
+        # Generates the initial NFA.
+        nfa = [ [] ]
+        self.encode(nfa, 0, None)
+
+        # Eliminate zero transitions.
+        zeros = []
+        exits = set()
+        for src in range(len(nfa)):
+            zeros.append([])
+            for sym, dst in nfa[src]:
+                if sym is None:
+                    if dst is None:
+                        exits.add(src)
+                    else:
+                        zeros[src].append(dst)
+        ordered = toposort(range(len(nfa)), zeros)
+        for src in ordered:
+            for dst in zeros[src]:
+                if dst in exits:
+                    exits.add(src)
+        for src in ordered:
+            events = []
+            for sym, dst in nfa[src]:
+                if sym is None:
+                    if dst is not None:
+                        events.extend(nfa[dst])
+                else:
+                    events.append((sym, dst))
+                    if dst in exits:
+                        events.append((sym, None))
+            nfa[src] = tuple(events)
+
+        # Eliminate duplicate states.
+        while len(set(nfa)) < len(nfa):
+            events_to_state = {}
+            for dup, events in enumerate(nfa):
+                if events in events_to_state:
+                    break
+                else:
+                    events_to_state[events] = dup
+            state = events_to_state[events]
+            del nfa[dup]
+            for src, events in enumerate(nfa):
+                nfa[src] = tuple((sym, state if dst == dup else
+                                       dst-1 if dst is not None and dst > dup else
+                                       dst) for sym, dst in events)
+
+        # Convert to DFA.
+        dfa = [{}]
+        if 0 not in exits:
+            start_set = frozenset([0])
+        else:
+            start_set = frozenset([0, None])
+        set_to_dfa_state = { start_set: 0 }
+        stack = [start_set]
+        while stack:
+            src_set = stack.pop()
+            dfa_src = set_to_dfa_state[src_set]
+            dfa_events = dfa[dfa_src]
+            sym_to_dst_set = {}
+            has_exit = False
+            for src in src_set:
+                if src is not None:
+                    for sym, dst in nfa[src]:
+                        sym_to_dst_set.setdefault(sym, set()).add(dst)
+                else:
+                    has_exit = True
+            if has_exit:
+                dfa_events[None] = None
+            for sym in sym_to_dst_set:
+                dst_set = frozenset(sym_to_dst_set[sym])
+                if dst_set in set_to_dfa_state:
+                    dfa_dst = set_to_dfa_state[dst_set]
+                else:
+                    dfa_dst = len(dfa)
+                    dfa.append({})
+                    stack.append(dst_set)
+                    set_to_dfa_state[dst_set] = dfa_dst
+                dfa_events[sym] = dfa_dst
+
+        return dfa
+
+
+class AltPat(Pattern):
+    """Alternation pattern."""
+
+    def __init__(self, arms):
+        self.arms = arms
+
+    def __unicode__(self):
+        return "(?: %s )" % " | ".join(unicode(arm) for arm in self.arms)
+
+    def encode(self, nfa, src, dst):
+        for arm in self.arms:
+            arm.encode(nfa, src, dst)
+
+
+class SeqPat(Pattern):
+    """Concatenation pattern."""
+
+    def __init__(self, arms):
+        self.arms = arms
+
+    def __unicode__(self):
+        return "(?: %s )" % " ".join(unicode(arm) for arm in self.arms)
+
+    def encode(self, nfa, src, dst):
+        i_src = None
+        i_dst = src
+        for arm in self.arms[:-1]:
+            i_src = i_dst
+            i_dst = len(nfa)
+            nfa.append([])
+            arm.encode(nfa, i_src, i_dst)
+        self.arms[-1].encode(nfa, i_dst, dst)
+
+
+class ModPat(Pattern):
+    """Repetition pattern."""
+
+    def __init__(self, arm, mod):
+        assert re.match(r"^(?:[?*+]|[{][\d]+[}]|[{][\d]+[,][\d]+[}])$", mod)
+        self.arm = arm
+        self.mod = mod
+
+    def __unicode__(self):
+        return "%s%s" % (self.arm, self.mod)
+
+    def encode(self, nfa, src, dst):
+        if self.mod in ["?", "*"]:
+            nfa[src].append((None, dst))
+        if self.mod == "?":
+            self.arm.encode(nfa, src, dst)
+        elif self.mod in ["*", "+"]:
+            i_src = len(nfa)
+            nfa.append([])
+            i_dst = len(nfa)
+            nfa.append([])
+            nfa[src].append((None, i_src))
+            self.arm.encode(nfa, i_src, i_dst)
+            nfa[i_dst].append((None, i_src))
+            nfa[i_dst].append((None, dst))
+        else:
+            if "," in self.mod:
+                l, u = self.mod[1:-1].split(",")
+            else:
+                l = u = self.mod[1:-1]
+            l = int(l)
+            u = int(u)
+            if l == 0:
+                # Zero repetitions are allowed; the chain below covers 1..u.
+                nfa[src].append((None, dst))
+                l = 1
+            if u >= l:
+                i_dst = src
+                for k in range(l-1):
+                    i_src = i_dst
+                    i_dst = len(nfa)
+                    nfa.append([])
+                    self.arm.encode(nfa, i_src, i_dst)
+                for k in range(u-l):
+                    i_src = i_dst
+                    i_dst = len(nfa)
+                    nfa.append([])
+                    self.arm.encode(nfa, i_src, i_dst)
+                    self.arm.encode(nfa, i_src, dst)
+                self.arm.encode(nfa, i_dst, dst)
+
+
+class SymPat(Pattern):
+    """Symbol pattern."""
+
+    def __init__(self, sym):
+        self.sym = sym
+
+    def __unicode__(self):
+        return unicode(self.sym)
+
+    def encode(self, nfa, src, dst):
+        nfa[src].append((self.sym, dst))
+
+
+class EpsPat(Pattern):
+    """Zero pattern."""
+
+    def __unicode__(self):
+        return "()"
+
+    def encode(self, nfa, src, dst):
+        nfa[src].append((None, dst))
+
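+# A hedged sketch of how the pattern classes compose (equivalent to /ab*/):
+#
+#     pat = SeqPat([SymPat('a'), ModPat(SymPat('b'), '*')])
+#     dfa = pat()   # a list of {symbol: next state} dicts; None marks an exit
+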
+
+class GrammarReader(object):
+    """
+    Scanner for grammar definitions.
+
+    Strips the common leading indentation, skips comments and blank lines,
+    and pulls patterns and punctuation off the grammar text.
+    """
+
+    def __init__(self, text, sym_pat, to_sym):
+        indent = None
+        for line in text.splitlines():
+            if line.lstrip():
+                line_indent = len(line)-len(line.lstrip())
+                if indent is None or line_indent < indent:
+                    indent = line_indent
+        if indent:
+            text = "\n".join(line[indent:] for line in text.splitlines())
+        self.text = text
+        self.pos = 0
+        self.sym_pat = sym_pat
+        self.to_sym = to_sym
+        self.skip(r"(?: [ ]* (?: [#] [^\n]* )? (?: [\n] | $ ) )+")
+
+    def pull(self, pat):
+        regex = re.compile(pat, re.X|re.U)
+        match = regex.match(self.text, self.pos)
+        if match is None:
+            return None
+        group = match.group()
+        self.pos = match.end()
+        self.skip()
+        return group
+
+    def peek(self, pat):
+        regex = re.compile(pat, re.X|re.U)
+        return (regex.match(self.text, self.pos) is not None)
+
+    def skip(self, pat=r"(?: [ ]+ | [#] [^\n]* | [\n] (?=[ #\n]) | [\n] $ )+"):
+        regex = re.compile(pat, re.X|re.U)
+        match = regex.match(self.text, self.pos)
+        if match is not None:
+            self.pos = match.end()
+
+    def pull_alt(self):
+        arms = [self.pull_seq()]
+        while self.pull(r"[|]") is not None:
+            arms.append(self.pull_seq())
+        return AltPat(arms) if len(arms) > 1 else arms[0]
+
+    def pull_seq(self):
+        arms = [self.pull_mod()]
+        while self.peek(r"[(]") or self.peek(self.sym_pat):
+            arms.append(self.pull_mod())
+        return SeqPat(arms) if len(arms) > 1 else arms[0]
+
+    def pull_mod(self):
+        arm = self.pull_atom()
+        mod = self.pull(r"[?*+] | [{] [\d]+ (?: [,] [\d]+ )? [}]")
+        return ModPat(arm, mod) if mod is not None else arm
+
+    def pull_atom(self):
+        if self.pull(r"[(][ ]*[)]") is not None:
+            return EpsPat()
+        elif self.pull(r"[(]") is not None:
+            pat = self.pull_alt()
+            if self.pull(r"[)]") is None:
+                raise self.fail("missing ')'")
+            return pat
+        sym = self.pull(self.sym_pat)
+        if sym is None:
+            raise self.fail("ill-formed pattern")
+        if self.to_sym is not None:
+            sym = self.to_sym(sym)
+        return SymPat(sym) if not isinstance(sym, Pattern) else sym
+
+    def mark(self):
+        return Mark("<grammar>", self.text, self.pos)
+
+    def fail(self, msg):
+        return Error(msg, self.mark())
+
+
+#
+# Scanner generator.
+#
+
+
+LexSection = collections.namedtuple('LexSection', ['name', 'rules', 'mark'])
+LexRule = collections.namedtuple('LexRule', ['name', 'pattern', 'is_skip', 'is_symbol', 'pop', 'push', 'mark'])
+
+
+def make_scan(grammar):
+    """Generates a scanner for the given lexical grammar."""
+    # Sanity check the argument.
+    if hasattr(grammar, 'read'):
+        grammar = grammar.read()
+    if isinstance(grammar, str):
+        grammar = grammar.decode('utf-8')
+    assert isinstance(grammar, unicode)
+
+    # Parse the grammar definition.
+    sections = omap()
+
+    sym_pat = r"[$][\w]+ | [\[] (?: [^\\\]] | \\. )* [\]] | ['] (?: [^'] | [']['] )+ ['] | [\^] | [$]"
+    subs = {}
+    def to_sym(text):
+        if text[0] == "$" and text != "$":
+            name = text[1:]
+            if name not in subs:
+                raise reader.fail("unknown substitution (%s)" % text)
+            return subs[name]
+        elif text[0] == text[-1] == "'":
+            return re.escape(text[1:-1].replace("''", "'"))
+        else:
+            return text
+    reader = GrammarReader(grammar, sym_pat, to_sym)
+
+    while not reader.peek(r"$"):
+        mark = reader.mark()
+        if reader.pull(r"[\[]") is not None:
+            name = reader.pull(r"[\w]+")
+            if name is None:
+                raise reader.fail("expected section name")
+            if reader.pull(r"[\]]") is None:
+                raise reader.fail("missing ']'")
+            if reader.pull(r"[\n] | $") is None:
+                raise reader.fail("expected end of line")
+            if name in sections:
+                raise reader.fail("duplicate section (%s)" % name)
+            section = LexSection(name, omap(), mark)
+            sections[section.name] = section
+        elif reader.pull(r"[$]") is not None:
+            name = reader.pull(r"[\w]+")
+            if name is None:
+                raise reader.fail("expected substitution name")
+            if reader.pull(r"[:]") is None:
+                raise reader.fail("missing ':'")
+            pattern = reader.pull_alt()
+            if reader.pull(r"[\n] | $") is None:
+                raise reader.fail("expected end of line")
+            subs[name] = pattern
+        else:
+            name = reader.pull(r"[\w]+")
+            if name is None:
+                raise reader.fail("expected token name")
+            if not name[0].isupper():
+                raise reader.fail("token name must be capitalized (%s)" % name)
+            if reader.pull(r"[:]") is None:
+                raise reader.fail("missing ':'")
+            pattern = reader.pull_alt()
+            is_skip = False
+            is_symbol = False
+            pop = None
+            push = None
+            while reader.peek(r"[!]"):
+                option = reader.pull(r"[!] [\w]+ (?: [:] [\w]+ )?")
+                if option is None:
+                    raise reader.fail("ill-formed option")
+                option = option[1:]
+                if "=" in option:
+                    key, val = option.split("=")
+                else:
+                    key = option
+                    val = True
+                if key == "skip" and val is True:
+                    is_skip = True
+                elif key == "symbol" and val is True:
+                    is_symbol = True
+                elif key == "pop" and val is True or val.isdigit():
+                    pop = val
+                elif key == "push" and val is not True:
+                    push = val
+                else:
+                    raise reader.fail("invalid option (%s)" % key)
+            if reader.pull(r"[\n] | $") is None:
+                raise reader.fail("expected end of line")
+            if not sections:
+                section = LexSection("", omap(), mark)
+                sections[section.name] = section
+            else:
+                section = sections.values()[-1]
+            if name in section.rules:
+                raise reader.fail("duplicate rule (%s)" % name)
+            rule = LexRule(name, unicode(pattern), is_skip, is_symbol, pop, push, mark)
+            section.rules[rule.name] = rule
+
+    # Sanity check for the grammar definition.
+    if not sections:
+        raise reader.fail("empty grammar")
+    for section in sections:
+        if not section.rules:
+            raise Error("empty section (%s)" % section.name, section.mark)
+        for rule in section.rules:
+            if rule.push is not None and rule.push not in sections:
+                raise Error("undefined section (%s)" % rule.push, rule.mark)
+
+    # Prepare scan tables and generate the scanner.
+    tables = omap()
+    for section in sections:
+        groups = []
+        patterns = []
+        for rule in section.rules:
+            patterns.append("(?P<%s> %s )" % (rule.name, rule.pattern))
+            groups.append(ScanGroup(rule.name, rule.is_skip, rule.is_symbol, rule.pop, rule.push))
+        regex = re.compile(" | ".join(patterns), re.X|re.U)
+        table = ScanTable(section.name, regex, groups)
+        tables[table.name] = table
+    return Scanner(tables)
+
+
+#
+# Lexical scanner.
+#
+
+
+class Token(object):
+
+    __slots__ = ('code', 'text', 'mark')
+
+    def __init__(self, code, text, mark):
+        self.code = code
+        self.text = text
+        self.mark = mark
+
+    def __nonzero__(self):
+        return bool(self.code)
+
+    def __unicode__(self):
+        if not self.code:
+            return "$"
+        return "%s(%s)" % (self.code if self.code.isalnum() else
+                          "`%s`" % self.code.replace("`", "``"),
+                          urllib.quote(self.text.encode('utf-8'), safe="").decode('utf-8'))
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return ("<%s %s>" % (self.__class__.__name__,
+                             "$" if not self.code else
+                             self.code if self.code.isalnum() else
+                             "`%s`" % self.code.replace("`", "``"))).encode('utf-8')
+
+
+ScanTable = collections.namedtuple('ScanTable', ['name', 'regex', 'groups'])
+ScanGroup = collections.namedtuple('ScanGroup', ['name', 'is_skip', 'is_symbol', 'pop', 'push'])
+
+
+class Scanner(object):
+    """Lexical scanner."""
+
+    def __init__(self, tables):
+        self.tables = tables
+
+    def __call__(self, text):
+        # Prepare the input.
+        if hasattr(text, 'name'):
+            src = text.name
+        else:
+            src = "<input>"
+        if isinstance(src, str):
+            src = src.decode('utf-8', 'replace')
+        if hasattr(text, 'read'):
+            text = text.read()
+        if isinstance(text, str):
+            try:
+                text = text.decode('utf-8')
+            except UnicodeDecodeError, exc:
+                mark = Mark(src, text.decode('utf-8', 'replace'), exc.start)
+                raise Error("input is not valid UTF-8 (%s)" % exc.reason, mark)
+        assert isinstance(text, unicode)
+
+        # Process the input.
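+        # The stack holds the active scan tables (one per section); rules
+        # marked !push/!pop switch sections, starting from the first one.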
+        stack = [next(iter(self.tables))]
+        tokens = []
+        pos = 0
+        while pos < len(text):
+            mark = Mark(src, text, pos)
+            if not stack:
+                raise Error("unexpected end of input", mark)
+            table = stack[-1]
+            match = table.regex.match(text, pos)
+            if match is None:
+                raise Error("unexpected input", mark)
+            for group in table.groups:
+                block = match.group(group.name)
+                if block is not None:
+                    break
+            else:
+                assert False
+            if not block and not group.pop and not group.push:
+                raise Error("stuck", mark)
+            pos = match.end()
+            if not group.is_skip:
+                code = group.name
+                if group.is_symbol:
+                    code = block
+                tokens.append(Token(code, block, mark))
+            if group.pop:
+                stack = stack[:-group.pop]
+            if group.push:
+                stack.append(self.tables[group.push])
+
+        mark = Mark(src, text, pos)
+        tokens.append(Token("", "", mark))
+        return tokens
+
+
+#
+# Parser generator.
+#
+
+
+SynRule = collections.namedtuple('SynRule', ['name', 'dfa', 'mark'])
+
+
+def make_parse(grammar):
+    """Generates a parser from the given syntax grammar."""
+    # Sanity check the argument.
+    if hasattr(grammar, 'read'):
+        grammar = grammar.read()
+    if isinstance(grammar, str):
+        grammar = grammar.decode('utf-8')
+    assert isinstance(grammar, unicode)
+
+    # Parse the grammar definition.
+    rules = omap()
+
+    sym_pat = r"[$][\w]* | [\w]+ | [`] (?: [^`] | [`][`] )* [`]"
+    subs = {}
+    def to_sym(text):
+        if text == "$":
+            return ("", True)
+        elif text[0] == "$":
+            name = text[1:]
+            if name not in subs:
+                raise reader.fail("unknown substitution (%s)" % text)
+            return subs[name]
+        elif text[0] == text[-1] == "`":
+            return (text[1:-1].replace("``", "`"), True)
+        elif text[0].isupper():
+            return (text, True)
+        else:
+            return (text, False)
+    reader = GrammarReader(grammar, sym_pat, to_sym)
+
+    while not reader.peek(r"$"):
+        mark = reader.mark()
+        if reader.pull(r"[$]") is not None:
+            name = reader.pull(r"[\w]+")
+            if name is None:
+                raise reader.fail("expected substitution name")
+            if reader.pull(r"[:]") is None:
+                raise reader.fail("missing ':'")
+            pattern = reader.pull_alt()
+            if reader.pull(r"[\n] | $") is None:
+                raise reader.fail("expected end of line")
+            subs[name] = pattern
+        else:
+            name = reader.pull(r"[\w]+")
+            if name is None:
+                raise reader.fail("expected token name")
+            if name[0].isupper():
+                raise reader.fail("production name must not be capitalized (%s)" % name)
+            if reader.pull(r"[:]") is None:
+                raise reader.fail("missing ':'")
+            pattern = reader.pull_alt()
+            if reader.pull(r"[\n] | $") is None:
+                raise reader.fail("expected end of line")
+            if name in rules:
+                raise reader.fail("duplicate rule (%s)" % name)
+            rule = SynRule(name, pattern(), mark)
+            rules[rule.name] = rule
+
+    # Sanity check for the grammar definition.
+    if not rules:
+        raise reader.fail("empty grammar")
+    for rule in rules:
+        if None in rule.dfa[0]:
+            raise Error("epsilon production (%s)" % rule.name, rule.mark)
+        for events in rule.dfa:
+            for event in events:
+                if event is not None:
+                    symbol, is_terminal = event
+                    if not is_terminal and symbol not in rules:
+                        raise Error("undefined production (%s)" % symbol, rule.mark)
+
+    # Generate the FIRST set.
+    deps = {}
+    for rule in rules:
+        deps[rule.name] = [symbol
+                for symbol, is_terminal in sorted(rule.dfa[0])
+                if not is_terminal and symbol != rule.name]
+    try:
+        ordered = toposort(rules.keys(), deps)
+    except RuntimeError, exc:
+        [loop] = exc.args
+        raise Error("left recursion (%s)" % " -> ".join(loop), rules[loop[0]].mark)
+    first = {}
+    for name in ordered:
+        rule = rules[name]
+        first[rule.name] = set()
+        for symbol, is_terminal in rule.dfa[0]:
+            if is_terminal:
+                first[rule.name].add(symbol)
+            else:
+                first[rule.name] |= first[symbol]
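+    # For the arithmetic demo grammar, for example, first['atom'] is
+    # {NUMBER, `(`}, and factor's unary `+`/`-` propagate upward, so
+    # first['expr'] is {NUMBER, `(`, `+`, `-`}.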
+
+    # Prepare parse tables and generate the parser.
+    tables = omap()
+    for rule in rules:
+
+        machine = []
+        for events in rule.dfa:
+
+            has_exit = False
+            terminals = set()
+            nonterminals = set()
+            for event in events:
+                if event is None:
+                    has_exit = True
+                else:
+                    symbol, is_terminal = event
+                    if is_terminal:
+                        terminals.add(symbol)
+                    elif not (symbol == rule.name and not machine):
+                        nonterminals.add(symbol)
+
+            conflicts = {}
+            for symbol in sorted(nonterminals):
+                for code in sorted(first[symbol]):
+                    if code in terminals:
+                        continue
+                    if code in conflicts:
+                        raise Error("conflict between productions %s and %s (%s)"
+                                % (conflicts[code], symbol, rule.name), rule.mark)
+                    conflicts[code] = symbol
+
+            transitions = {}
+            if has_exit:
+                transitions[None] = (None, None)
+            for symbol in nonterminals:
+                dst = events[symbol, False]
+                if len(nonterminals) == 1 and not has_exit:
+                    transitions[None] = (symbol, dst)
+                else:
+                    for code in first[symbol]:
+                        transitions[code] = (symbol, dst)
+            for symbol in terminals:
+                dst = events[symbol, True]
+                transitions[symbol] = (None, dst)
+            machine.append(transitions)
+        left = rule.dfa[0].get((rule.name, False))
+        table = ParseTable(rule.name, machine, left)
+        tables[table.name] = table
+
+    return Parser(tables)
+
+
+#
+# Syntax parsing.
+#
+
+
+class Syntax(object):
+
+    __slots__ = ('code', 'arms', 'mark')
+
+    def __init__(self, code, arms, mark):
+        self.code = code
+        self.arms = arms
+        self.mark = mark
+
+    def __unicode__(self):
+        chunks = []
+        chunks.append("%s:" % self.code)
+        for arm in self.arms:
+            lines = unicode(arm).splitlines()
+            chunks.append("- "+lines[0])
+            for line in lines[1:]:
+                chunks.append("  "+line)
+        return "\n".join(chunks)
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return ("<%s %s>" % (self.__class__.__name__, self.code)).encode('utf-8')
+
+
+ParseTable = collections.namedtuple('ParseTable', ['name', 'machine', 'left'])
+
+
+class Parser(object):
+
+    def __init__(self, tables):
+        self.tables = tables
+
+    def __call__(self, tokens):
+        stack = [(next(iter(self.tables)), 0, [])]
+        idx = 0
+        while stack:
+            table, state, arms = stack.pop()
+            transitions = table.machine[state]
+            token = tokens[idx]
+            if token and token.code in transitions:
+                symbol, dst = transitions[token.code]
+            elif None in transitions:
+                symbol, dst = transitions[None]
+            else:
+                if token:
+                    raise Error("unexpected input for %s (%s)" % (table.name, token.code), token.mark)
+                else:
+                    raise Error("unexpected end of input for %s" % table.name, token.mark)
+            if dst is not None:
+                if symbol is None:
+                    arms.append(token)
+                    idx += 1
+                stack.append((table, dst, arms))
+                if symbol is not None:
+                    stack.append((self.tables[symbol], 0, []))
+            else:
+                node = Syntax(table.name, arms, arms[0].mark)
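+                # Left recursion: if the lookahead continues this rule,
+                # restart from the `left` state with the finished node as
+                # the first arm; otherwise hand the node to the caller.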
+                if token and table.left is not None and token.code in table.machine[table.left]:
+                    stack.append((table, table.left, [node]))
+                elif stack:
+                    table, state, arms = stack.pop()
+                    arms.append(node)
+                    stack.append((table, state, arms))
+                else:
+                    if token:
+                        raise Error("unexpected input for %s (%s)" % (table.name, token.code), token.mark)
+
+        return node
+
+