Commits

Martin Vejnár committed 73c4f93

ebnf_parse finished, added DocParser.

  • Parent commits 785cf70

Files changed (6)

File docparser.py

+from ebnf_grammar import ebnf_parse
+from grammar import Grammar
+from lrparser import Parser
+
+class Test:
+    def p_root(self, minus, int_digits, float_digits):
+        """ float = ["-"], { "1" }, '.', { "1" }; """
+        return (minus, int_digits, float_digits)
+
+class DocParser:
+    def __init__(self, cls):
+        rules = []
+        for name in dir(cls):
+            if name[:2] != 'p_':
+                continue
+            action = getattr(cls, name)
+            grammar = action.__doc__
+            if not grammar:
+                continue
+            rules.extend(ebnf_parse(grammar, action))
+        self.cls = cls
+        self.grammar = Grammar(*rules)
+        self.parser = Parser(self.grammar)
+        
+    def parse(self, input, context=None):
+        return self.parser.parse(input, context=context or self.cls())
+
+if __name__ == '__main__':
+    p = DocParser(Test)
+    print p.grammar
+    print p.parse('-111.11')
+    print p.parse('111.11')
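
A quick sketch of how DocParser is meant to be consumed, to complement the
Test class above. Note that rules are gathered in dir() order, i.e.
alphabetically, and the first rule's left-hand symbol becomes the start
symbol, so the method defining the root rule has to sort first; the class
name and the '1'/'2' infixes below are purely illustrative.

    from docparser import DocParser

    class Csv:
        def p_1_line(self, first, rest):
            """ line = field, { ',', field }; """
            return [first] + rest

        def p_2_field(self, chars):
            """ field = { "x" }; """
            return ''.join(chars)

    p = DocParser(Csv)
    print p.parse('xx,x')    # ['xx', 'x']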

File ebnf_grammar.py

+"""
+Provides a conversion from EBNF (ISO/IEC 14977:1996) to Rule objects.
+
+The core functionality is provided by the 'ebnf_parse' function,
+which receives a string containing an EBNF grammar
+and returns the list of corresponding Rule objects.
+
+>>> grammar_in_ebnf = 'list = { item }; item = "i";'
+>>> for rule in ebnf_parse(grammar_in_ebnf):
+...     print rule
+list ::= @2
+@2 ::= <empty>
+@2 ::= @2 @1
+@1 ::= item
+item ::= i
+
+The helper function 'parse_from_ebnf' uses 'ebnf_parse'
+to generate the list of Rule objects, constructs a grammar and a parser,
+and finally uses the parser to parse an input.
+
+The following subset of the ISO specification is supported:
+
+grammar = { rule };
+rule = symbol, '=', [ list of alternatives ], ';';
+list of alternatives = item sequence, { '|', item sequence };
+item sequence = item, { ',', item };
+item = symbol | quoted literal
+    | '(', list of alternatives, ')'    # grouping
+    | '[', list of alternatives, ']'    # optionality
+    | '{', list of alternatives, '}';   # repetition
+symbol = identifier, { identifier };
+
+The symbol on the left side of the first rule is considered the starting symbol
+and the corresponding Rule object will be the first in the result list.
+
+Actions associated with the returned Rule objects group the subexpressions
+in the following manner.
+
+For a list of alternatives 'X = a_1 | a_2;', the value of 'X' is
+that of the matched alternative.
+
+>>> parse_from_ebnf('a', 'root = "a" | "b";')
+('a',)
+
+For a sequential composition 'X = a_1, a_2;',
+the value of 'X' is a tuple of the values of the subexpressions. Literals quoted
+with apostrophes and symbols whose names start with an underscore are omitted
+from the tuple.
+
+>>> parse_from_ebnf('ab', 'root = "a", "b" | "c";')
+('a', 'b')
+>>> parse_from_ebnf('ab', '''root = "a", 'b' | "c";''')
+('a',)
+>>> parse_from_ebnf('c', 'root = "a", "b" | "c";')
+('c',)
+
+An optional subexpression has the value of the nested subexpression,
+or None if it is missing.
+
+>>> parse_from_ebnf('a', 'root = "a", ["b"] | "c";')
+('a', None)
+>>> parse_from_ebnf('ab', 'root = "a", ["b"] | "c";')
+('a', 'b')
+
+The repetition introduces a list.
+
+>>> parse_from_ebnf('a', 'root = "a", {"b"} | "c";')
+('a', [])
+>>> parse_from_ebnf('abb', 'root = "a", {"b"} | "c";')
+('a', ['b', 'b'])
+
+A grouping has the same value as the nested subexpression,
+except when the value is a one-tuple, in which case it is unboxed.
+
+>>> parse_from_ebnf('a', 'root = {("a")};')
+(['a'],)
+>>> parse_from_ebnf('ab', 'root = {("a", "b")};')
+([('a', 'b')],)
+
+The optional argument 'action' is called after each EBNF reduction
+to transform the value of the corresponding nonterminal.
+
+>>> parse_from_ebnf('aaa', 'root = { "a" };', action=lambda self, list: '.'.join(list))
+'a.a.a'
+"""
+
+from rule import Rule
+from grammar import Grammar
+from lrparser import Parser
+from simple_lexer import simple_lexer
+
+class _Item:
+    def __init__(self, syms, visibility, rules):
+        self.syms = syms
+        self.visibility = visibility
+        self.rules = rules
+        
+    def __repr__(self):
+        return repr((self.syms, self.rules))
+        
+class _ActionFilter:
+    def __init__(self, action, visibilities):
+        self.action = action
+        self.visibilities = visibilities
+        
+    def __call__(self, self2, *args):
+        return self.action(self2, *(arg for j, arg in enumerate(args) if self.visibilities[j]))
+
+class _AltList:
+    def __init__(self, item):
+        self.items = [item.syms]
+        self.visibilities = [item.visibility]
+        self.rules = item.rules
+
+    def __repr__(self):
+        return repr((self.items, self.visibilities, self.rules))
+
+    def make_rules(self, unique, action=lambda self, *args: tuple(args) if len(args) != 1 else args[0]):
+        for i, item in enumerate(self.items):
+            self.rules.append(Rule(unique, *item,
+                action=_ActionFilter(action, self.visibilities[i])))
+        return self.rules
+
+def _concat_list(self, list, item):
+    list.append(item)
+    return list
+
+class _Context:
+    def __init__(self, action):
+        self.counter = 0
+        self.action = action
+    
+    def unique(self):
+        self.counter += 1
+        return '@%d' % self.counter
+    
+    def item_sym(self, sym):
+        return _Item([sym], [True], [])
+    
+    def item_lit(self, quote, lit):
+        visible = quote == '"'
+        return _Item([ch for ch in lit[1]], [visible for ch in lit[1]], [])
+        
+    def item_group(self, _1, alt_list, _2):
+        new_sym = self.unique()
+        return _Item([new_sym], [True], alt_list.make_rules(new_sym))
+        
+    def item_rep(self, _1, alt_list, _2):
+        new_sym = self.unique()
+        rules = alt_list.make_rules(new_sym)
+        rep_sym = self.unique()
+        rules.append(Rule(rep_sym, rep_sym, new_sym, action=_concat_list))
+        rules.append(Rule(rep_sym, action=lambda self: []))
+        return _Item([rep_sym], [True], rules)
+
+    def item_opt(self, _1, alt_list, _2):
+        new_sym = self.unique()
+        rules = alt_list.make_rules(new_sym)
+        rules.append(Rule(new_sym, action=lambda self: None))
+        return _Item([new_sym], [True], rules)
+
+    def seq_list_concat(self, seq_list, _1, item):
+        seq_list.rules.extend(item.rules)
+        seq_list.visibility.extend(item.visibility)
+        seq_list.syms.extend(item.syms)
+        return seq_list
+        
+    def new_alt_list(self, seq_list):
+        return _AltList(seq_list)
+        
+    def append_alt_list(self, alt_list, _1, seq_list):
+        alt_list.rules.extend(seq_list.rules)
+        alt_list.visibilities.append(seq_list.visibility)
+        alt_list.items.append(seq_list.syms)
+        return alt_list
+        
+    def empty_rule(self, sym, _1, _2):
+        return [Rule(sym, action=self.action)]
+        
+    def make_rule(self, sym, _1, alt_list, _2):
+        return alt_list.make_rules(sym, self.action)
+        
+    def append_rule(self, rule_list, rule):
+        rule.reverse()
+        rule_list.extend(rule)
+        return rule_list
+
+_ebnf_grammar = Grammar(
+    Rule('rule_list', action=lambda self: []),
+    Rule('rule_list', 'rule_list', 'rule', action=_Context.append_rule),
+    
+    Rule('rule', 'sym', '=', ';', action=_Context.empty_rule),
+    Rule('rule', 'sym', '=', 'alt_list', ';', action=_Context.make_rule),
+    
+    Rule('alt_list', 'seq_list', action=_Context.new_alt_list),
+    Rule('alt_list', 'alt_list', '|', 'seq_list', action=_Context.append_alt_list),
+    
+    Rule('seq_list', 'item', action=lambda self, item: item),
+    Rule('seq_list', 'seq_list', ',', 'item', action=_Context.seq_list_concat),
+    
+    Rule('item', 'sym', action=_Context.item_sym),
+    Rule('item', '\'', 'ql', action=_Context.item_lit),
+    Rule('item', '"', 'ql', action=_Context.item_lit),
+    Rule('item', '(', 'alt_list', ')', action=_Context.item_group),
+    Rule('item', '{', 'alt_list', '}', action=_Context.item_rep),
+    Rule('item', '[', 'alt_list', ']', action=_Context.item_opt),
+    
+    Rule('sym', 'id', action=lambda self, id: id[1]),
+    Rule('sym', 'sym', 'id', action=lambda self, sym, id: ' '.join((sym, id[1])))
+    )
+    
+_ebnf_parser = Parser(_ebnf_grammar, k=1)
+
+def ebnf_parse(input, action=lambda self, *args: args):
+    """Parses an EBNF grammar and returns a corresponding list of Rule objects."""
+    return _ebnf_parser.parse(simple_lexer(input), context=_Context(action))
+
+def parse_from_ebnf(input, ebnf, action=lambda self, *args: args):
+    """Parses an EBNF grammar, creates a parser for it and parses an input with it."""
+    return Parser(Grammar(*ebnf_parse(ebnf, action=action))).parse(input)
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
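
One detail of ebnf_grammar.py the doctests above leave implicit: the two
'sym' rules at the bottom of the grammar let a symbol name consist of
several identifiers, as in the ISO notation, so spaced names work directly.
A sketch in the same doctest style:

    >>> parse_from_ebnf('ab', 'item pair = "a", "b";')
    ('a', 'b')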

File lrparser.py

+from grammar import Grammar, Rule
+from first import First
+
+# TODO: Fix 'shift' and 'reduce' tokens (a bool perhaps?).
+
+class InvalidGrammarError(Exception):
+    """Raised during the construction of a parser if the grammar is not LR(k)."""
+    pass
+
+class ParsingError(Exception):
+    """Raised by a parser if the input word is not a sentence of the grammar."""
+    pass
+    
+class Parser:
+    """Represents a LR(k) parser.
+    
+    The parser is created with a grammar and a 'k'. The LR parsing tables
+    are created during construction. If the grammar is not LR(k),
+    an InvalidGrammarException is raised.
+    
+    >>> not_a_lr0_grammar = Grammar(Rule('list'), Rule('list', 'item', 'list'))
+    >>> Parser(not_a_lr0_grammar, k=0) # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+        ...
+    InvalidGrammarError: LR(0) table conflict ...
+
+    >>> lr0_grammar = Grammar(
+    ...     Rule('list', action=lambda c: [] if c == None else [c]),
+    ...     Rule('list', 'list', 'item', action=lambda c, l, i: l + [i]))
+    >>> p = Parser(lr0_grammar, k=0)
+    >>> print p.grammar
+    list ::= <empty>
+    list ::= list item
+    
+    The method 'parse' accepts an iterable of tokens (which are arbitrary objects)
+    and an extraction function. The extraction function should return the terminal symbol
+    associated with the token. By default, the extraction function returns the object
+    itself, or its first field if the token is a tuple. Whenever the parser reduces
+    a word to a non-terminal, an action associated with the reduction rule is executed.
+    This way Abstract Syntax Trees or other objects can be constructed. The parse
+    method returns the result of the action associated with the topmost reduction rule.
+    
+    Optionally, the 'parse' method accepts a 'context' keyword argument,
+    which is passed to each action when a reduction occurs. By default, the context
+    is None. The optional 'prereduce_visitor' and 'postreduce_visitor' callables,
+    if given, are called before and after each reduction: the former with the values
+    about to be reduced, the latter with the rule and the newly produced value.
+    
+    >>> p.parse(())
+    []
+    >>> p.parse(('item', 'item', 'item', 'item'))
+    ['item', 'item', 'item', 'item']
+    >>> p.parse('spam', extract=lambda x: 'item')
+    ['s', 'p', 'a', 'm']
+    >>> p.parse('gg', extract=lambda x: 'item', context='e')
+    ['e', 'g', 'g']
+    
+    If an error occurs during parsing, a ParsingError is raised.
+    
+    >>> p.parse('spam')
+    Traceback (most recent call last):
+        ...
+    ParsingError: Unexpected input token: 's'
+    """
+    
+    def __init__(self, grammar, k=1, keep_states=False):
+        
+        if grammar.root() == None:
+            raise InvalidGrammarError('There must be at least one rule in the grammar.')
+            
+        self.grammar = Grammar(*grammar)
+        self.k = k
+        
+        # Augment the grammar with a special rule: 'S -> R',
+        # where S is a new non-terminal (in this case '').
+        aug_grammar = Grammar(Rule('', grammar.root()), *grammar)
+            
+        first = First(aug_grammar, k)
+        
+        def _close_itemset(itemset):
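+            # Classic LR closure: for each item [A -> u.Bv, la], add an item
+            # [B -> .w, la'] for every rule B -> w, where la' is drawn from
+            # FIRST_k(v + la) (computed by next_lookaheads below).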
+            i = 0
+            while i < len(itemset):
+                curitem = itemset[i]
+                
+                for next_lookahead in curitem.next_lookaheads(first):
+                    for next_rule in aug_grammar.rules(curitem.next_token()):
+                        newitem = _Item(next_rule, 0, next_lookahead)
+                        if newitem not in itemset:
+                            itemset.append(newitem)
+                
+                i += 1
+                
+        def _goto(itemset, symbol):
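+            # LR goto: advance the dot over 'symbol' in every applicable item,
+            # then close the resulting set.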
+            res = []
+            for item in itemset:
+                if item.next_token() != symbol:
+                    continue
+                    
+                res.append(_Item(item.rule, item.index + 1, item.lookahead))
+                
+            _close_itemset(res)
+            return set(res)
+        
+        itemset = [_Item(aug_grammar[0], 0, ())]
+        _close_itemset(itemset)
+        
+        states = [set(itemset)]
+        
+        goto_table = {}
+        
+        done = False
+        while not done:
+            done = True
+            
+            i = 0
+            while i < len(states):
+                itemset = states[i]
+                
+                for symbol in aug_grammar.symbols():
+                    newstate = _goto(itemset, symbol)
+                    if len(newstate) == 0:
+                        continue
+                        
+                    for j, state in enumerate(states):
+                        if newstate == state:
+                            goto_table[i, symbol] = j
+                            break
+                    else:
+                        goto_table[i, symbol] = len(states)
+                        states.append(newstate)
+                        done = False
+                
+                i += 1
+        
+        action_table = {}
+        accepting_state = None
+        
+        def add_action(state_id, lookahead, action, item):
+            key = (state_id, lookahead)
+            if key in action_table and action_table[key] != action:
+                raise InvalidGrammarError('LR(%i) table conflict at %s: actions %s, %s trying to add %s' % (k, key, action_table[key], action, item))
+            action_table[key] = action
+        
+        for state_id, state in enumerate(states):
+            for item in state:
+                nt = item.next_token()
+                if nt == None:
+                    if item.rule.left == '':
+                        accepting_state = state_id
+                        add_action(state_id, item.lookahead, ('shift',), item)
+                    else:
+                        add_action(state_id, item.lookahead, ('reduce', item.rule), item)
+                elif aug_grammar.is_terminal(nt):
+                    for la in item.lookaheads(first):
+                        add_action(state_id, la, ('shift',), item)
+        
+        assert accepting_state != None
+        self.goto = goto_table
+        self.action = action_table
+        self.accepting_state = accepting_state
+        self.k = k
+        
+        if keep_states:
+            self.states = states
+
+    def parse(self, sentence, context=None, extract=lambda arg: arg[0] if type(arg) == tuple else arg, prereduce_visitor=None, postreduce_visitor=None):
+        it = iter(sentence)
+        buf = []
+        while len(buf) < self.k:
+            try:
+                buf.append(it.next())
+            except StopIteration:
+                break
+                    
+        def get_shift_token():
+            if len(buf) == 0:
+                try:
+                    return it.next()
+                except StopIteration:
+                    return None
+            else:
+                res = buf.pop(0)
+                try:
+                    buf.append(it.next())
+                except StopIteration:
+                    pass
+                return res
+        
+        states = [0]
+        asts = []
+        while True:
+            state = states[-1]
+            
+            key = (state, tuple(extract(token) for token in buf))
+            if key not in self.action:
+                raise ParsingError('Unexpected input token: %s' % repr(tuple(buf)))
+            
+            action = self.action[key]
+            if action[0] == 'reduce':
+                rule = action[1]
+                
+                if len(rule.right) > 0:
+                    if prereduce_visitor:
+                        prereduce_visitor(*asts[-len(rule.right):])
+                    new_ast = rule.action(context, *asts[-len(rule.right):])
+                    if postreduce_visitor:
+                        postreduce_visitor(rule, new_ast)
+                    del states[-len(rule.right):]
+                    del asts[-len(rule.right):]
+                else:
+                    if prereduce_visitor:
+                        prereduce_visitor()
+                    new_ast = rule.action(context)
+                    if postreduce_visitor:
+                        postreduce_visitor(rule, new_ast)
+                
+                states.append(self.goto[states[-1], rule.left])
+                asts.append(new_ast)
+            else: # shift
+                tok = get_shift_token()
+                if tok == None:
+                    if state == self.accepting_state:
+                        assert len(asts) == 1
+                        return asts[0]
+                    else:
+                        raise ParsingError('Reached the end of file prematurely.')
+                
+                key = (state, extract(tok))
+                if key not in self.goto:
+                    raise ParsingError('Unexpected input token: %s' % repr(key[1]))
+                
+                states.append(self.goto[key])
+                asts.append(tok)
+
+class _Item:
+    def __init__(self, rule, index, lookahead):
+        self.rule = rule
+        self.index = index
+        self.lookahead = lookahead
+        
+        self.final = len(self.rule.right) <= self.index
+    
+    def __cmp__(self, other):
+        return cmp(
+            (self.rule, self.index, self.lookahead),
+            (other.rule, other.index, other.lookahead))
+    
+    def __hash__(self):
+        return hash((self.rule, self.index, self.lookahead))
+        
+    def __str__(self):
+        out = [self.rule.left, '::=']
+        out.extend(self.rule.right)
+        out.insert(self.index + 2, '.')
+        return ' '.join(out) + ' ' + str(self.lookahead)
+    
+    def __repr__(self):
+        return ''.join(("'", self.__str__(), "'"))
+    
+    def next_token(self):
+        return self.rule.right[self.index] if not self.final else None
+    
+    def next_lookaheads(self, first):
+        rule_suffix = self.rule.right[self.index + 1:]
+        word = rule_suffix + self.lookahead
+        return first(word)
+    
+    def lookaheads(self, first):
+        rule_suffix = self.rule.right[self.index:]
+        word = rule_suffix + self.lookahead
+        return first(word)
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
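
The 'prereduce_visitor' and 'postreduce_visitor' hooks on 'parse' are new
in this commit and exercised nowhere above. A minimal sketch of their use,
assuming grammar.py exports Grammar and Rule as the import at the top of
this file suggests:

    from grammar import Grammar, Rule
    from lrparser import Parser

    g = Grammar(
        Rule('list', action=lambda ctx: []),
        Rule('list', 'list', 'item', action=lambda ctx, l, i: l + [i]))
    p = Parser(g, k=0)

    def trace(rule, value):
        # Called after every reduction with the rule and the value it produced.
        print 'reduced to', rule.left, '->', value

    print p.parse(('item', 'item'), postreduce_visitor=trace)
    # reduced to list -> []
    # reduced to list -> ['item']
    # reduced to list -> ['item', 'item']
    # ['item', 'item']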

File parser.py

-from grammar import Grammar, Rule
-from first import First
-
-# TODO: Fix 'shift' and 'reduce' tokens (a bool perhaps?).
-
-class InvalidGrammarError(BaseException):
-    """Raised during a construction of a parser, if the grammar is not LR(k)."""
-    pass
-
-class ParsingError(BaseException):
-    """Raised by a parser if the input word is not a sentence of the grammar."""
-    pass
-    
-class Parser:
-    """Represents a LR(k) parser.
-    
-    The parser is created with a grammar and a 'k'. The LR parsing tables
-    are created during construction. If the grammar is not LR(k),
-    an InvalidGrammarException is raised.
-    
-    >>> not_a_lr0_grammar = Grammar(Rule('list'), Rule('list', 'item', 'list'))
-    >>> Parser(not_a_lr0_grammar, k=0) # doctest: +ELLIPSIS
-    Traceback (most recent call last):
-        ...
-    InvalidGrammarError: LR(0) table conflict ...
-
-    >>> lr0_grammar = Grammar(
-    ...     Rule('list', action=lambda c: [] if c == None else [c]),
-    ...     Rule('list', 'list', 'item', action=lambda c, l, i: l + [i]))
-    >>> p = Parser(lr0_grammar, k=0)
-    >>> print p.grammar
-    list ::= <empty>
-    list ::= list item
-    
-    The method 'parse' will accept an iterable of tokens (which are arbitrary objects)
-    and an extraction function. The extraction function should return the terminal symbol
-    associated with the token. By default, the extraction function is an identity. Whenever
-    the parser reduces a word to a non-terminal, an action associated with the reduction rule
-    is executed. This way Abstract Syntax Trees or other objects can be constructed.
-    The parse method returns the result of an action associated with the topmost reduction rule.
-    
-    Optionally, the 'parse' function will accept a 'context' keyword argument.
-    This is passed to an action when reduction occurs. By default, context is None.
-    
-    >>> p.parse(())
-    []
-    >>> p.parse(('item', 'item', 'item', 'item'))
-    ['item', 'item', 'item', 'item']
-    >>> p.parse('spam', extract=lambda x: 'item')
-    ['s', 'p', 'a', 'm']
-    >>> p.parse('gg', extract=lambda x: 'item', context='e')
-    ['e', 'g', 'g']
-    
-    If an error occurs during parsing, a ParsingError is raised.
-    
-    >>> p.parse('spam')
-    Traceback (most recent call last):
-        ...
-    ParsingError: Unexpected input token: 's'
-    """
-    
-    def __init__(self, grammar, k, keep_states=False):
-        
-        if grammar.root() == None:
-            raise InvalidGrammarError('There must be at least one rule in the grammar.')
-            
-        self.grammar = Grammar(*grammar)
-        self.k = k
-        
-        # Augment the grammar with a special rule: 'S -> R',
-        # where S is a new non-terminal (in this case '').
-        aug_grammar = Grammar(Rule('', grammar.root()), *grammar)
-            
-        first = First(aug_grammar, k)
-        
-        def _close_itemset(itemset):
-            i = 0
-            while i < len(itemset):
-                curitem = itemset[i]
-                
-                for next_lookahead in curitem.next_lookaheads(first):
-                    for next_rule in aug_grammar.rules(curitem.next_token()):
-                        newitem = _Item(next_rule, 0, next_lookahead)
-                        if newitem not in itemset:
-                            itemset.append(newitem)
-                
-                i += 1
-                
-        def _goto(itemset, symbol):
-            res = []
-            for item in itemset:
-                if item.next_token() != symbol:
-                    continue
-                    
-                res.append(_Item(item.rule, item.index + 1, item.lookahead))
-                
-            _close_itemset(res)
-            return set(res)
-        
-        itemset = [_Item(aug_grammar[0], 0, ())]
-        _close_itemset(itemset)
-        
-        states = [set(itemset)]
-        
-        goto_table = {}
-        
-        done = False
-        while not done:
-            done = True
-            
-            i = 0
-            while i < len(states):
-                itemset = states[i]
-                
-                for symbol in aug_grammar.symbols():
-                    newstate = _goto(itemset, symbol)
-                    if len(newstate) == 0:
-                        continue
-                        
-                    for j, state in enumerate(states):
-                        if newstate == state:
-                            goto_table[i, symbol] = j
-                            break
-                    else:
-                        goto_table[i, symbol] = len(states)
-                        states.append(newstate)
-                        done = False
-                
-                i += 1
-        
-        action_table = {}
-        accepting_state = None
-        
-        def add_action(state_id, lookahead, action, item):
-            key = (state_id, lookahead)
-            if key in action_table and action_table[key] != action:
-                raise InvalidGrammarError('LR(%i) table conflict at %s: actions %s, %s trying to add %s' % (k, key, action_table[key], action, item))
-            action_table[key] = action
-        
-        for state_id, state in enumerate(states):
-            for item in state:
-                nt = item.next_token()
-                if nt == None:
-                    if item.rule.left == '':
-                        accepting_state = state_id
-                        add_action(state_id, item.lookahead, ('shift',), item)
-                    else:
-                        add_action(state_id, item.lookahead, ('reduce', item.rule), item)
-                elif aug_grammar.is_terminal(nt):
-                    for la in item.lookaheads(first):
-                        add_action(state_id, la, ('shift',), item)
-        
-        assert accepting_state != None
-        self.goto = goto_table
-        self.action = action_table
-        self.accepting_state = accepting_state
-        self.k = k
-        
-        if keep_states:
-            self.states = states
-
-    def parse(self, sentence, context=None, extract=lambda x: x):
-        it = iter(sentence)
-        buf = []
-        while len(buf) < self.k:
-            try:
-                buf.append(it.next())
-            except StopIteration:
-                break
-                    
-        def get_shift_token():
-            if len(buf) == 0:
-                try:
-                    return it.next()
-                except StopIteration:
-                    return None
-            else:
-                res = buf.pop(0)
-                try:
-                    buf.append(it.next())
-                except StopIteration:
-                    pass
-                return res
-        
-        states = [0]
-        asts = []
-        while True:
-            state = states[-1]
-            
-            key = (state, tuple(extract(token) for token in buf))
-            if key not in self.action:
-                raise ParsingError('Unexpected input token: %s' % repr(key[1]))
-            
-            action = self.action[key]
-            if action[0] == 'reduce':
-                rule = action[1]
-                
-                if len(rule.right) > 0:
-                    new_ast = rule.action(context, *asts[-len(rule.right):])
-                    del states[-len(rule.right):]
-                    del asts[-len(rule.right):]
-                else:
-                    new_ast = rule.action(context)
-                
-                states.append(self.goto[states[-1], rule.left])
-                asts.append(new_ast)
-            else: # shift
-                tok = get_shift_token()
-                if tok == None:
-                    if state == self.accepting_state:
-                        assert len(asts) == 1
-                        return asts[0]
-                    else:
-                        raise ParsingError('Reached the end of file prematurely.')
-                
-                key = (state, extract(tok))
-                if key not in self.goto:
-                    raise ParsingError('Unexpected input token: %s' % repr(key[1]))
-                
-                states.append(self.goto[key])
-                asts.append(tok)
-
-class _Item:
-    def __init__(self, rule, index, lookahead):
-        self.rule = rule
-        self.index = index
-        self.lookahead = lookahead
-        
-        self.final = len(self.rule.right) <= self.index
-    
-    def __cmp__(self, other):
-        return cmp(
-            (self.rule, self.index, self.lookahead),
-            (other.rule, other.index, other.lookahead))
-    
-    def __hash__(self):
-        return hash((self.rule, self.index, self.lookahead))
-        
-    def __str__(self):
-        out = [self.rule.left, '::=']
-        out.extend(self.rule.right)
-        out.insert(self.index + 2, '.')
-        return ' '.join(out) + ' ' + str(self.lookahead)
-    
-    def __repr__(self):
-        return ''.join(("'", self.__str__(), "'"))
-    
-    def next_token(self):
-        return self.rule.right[self.index] if not self.final else None
-    
-    def next_lookaheads(self, first):
-        rule_suffix = self.rule.right[self.index + 1:]
-        word = rule_suffix + self.lookahead
-        return first(word)
-    
-    def lookaheads(self, first):
-        rule_suffix = self.rule.right[self.index:]
-        word = rule_suffix + self.lookahead
-        return first(word)
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()

File rule.py

     
     A custom semantic action is associated in the constructor.
     
-    >>> r = Rule('list', 'list', 'item', action=lambda c, l, i: l + [i])
+    >>> r = Rule('list', 'list', 'item', action=lambda self, l, i: l + [i])
     >>> r.action(None, [], 1)
     [1]
     """

File simple_lexer.py

+class _Classify:
+    def __init__(self):
+        self.quote = None
+        self.comment = False
+        
+    def __call__(self, ch):
+        if self.comment:
+            if ch == '\n':
+                self.comment = False
+            return False
+    
+        if ch == self.quote:
+            self.quote = None
+            return
+        if ch in '\'"':
+            self.quote = ch
+            return ''
+        if self.quote:
+            return 'ql'
+        
+        if ch == '#':
+            self.comment = True
+            return
+    
+        if ch.isspace():
+            return
+        
+        if ch.isalnum() or ch == '_':
+            return 'id'
+        
+        return ''
+
+def simple_lexer(input, classify=None):
+    """Provides a simple lexer.
+    
+    >>> list(simple_lexer('spam, eggs # comment'))
+    [('id', 'spam'), ',', ('id', 'eggs')]
+    >>> list(simple_lexer('monty python'))
+    [('id', 'monty'), ('id', 'python')]
+    >>> list(simple_lexer('  "spam and eggs"  '))
+    [('ql', 'spam and eggs')]
+    """
+    
+    classify = classify or _Classify()
+    
+    lit = ''
+    last_cl = None
+    for ch in input:
+        cl = classify(ch)
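+        # '==' rather than 'is' is load-bearing here: None == False is False,
+        # so separators (None) fall through, while characters classified as
+        # False (comment text) are dropped entirely.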
+        if cl == False:
+            continue
+        
+        if lit and cl != last_cl:
+            yield (last_cl, lit)
+            lit = ''
+        
+        last_cl = cl
+        if cl == '':
+            yield ch
+        elif cl:
+            lit += ch
+    if lit:
+        yield (last_cl, lit)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
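
The 'classify' argument makes the lexer pluggable. A sketch of a custom
classifier following the same protocol; the class below is illustrative,
not part of the commit:

    class DigitClassify:
        def __call__(self, ch):
            if ch.isdigit():
                return 'num'    # accumulate runs of digits into one token
            if ch.isspace():
                return None     # separator: terminates the current token
            return ''           # anything else is emitted by itself

    print list(simple_lexer('12 + 34', classify=DigitClassify()))
    # [('num', '12'), '+', ('num', '34')]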