Anonymous committed 4af2e44

Use a fixed number instead of sys.maxint.
Change pattern repr.
Add some pattern optimizations.
Allow {N} in pattern syntax.
Refactor the parser driver to allow a token source other than tokenize.
Add a filter for tokenize that removes significant whitespace
(so it can be insignificant for patterns).
Close the stream from which the grammar is read if we opened it.

Files changed (7)

PatternGrammar.txt

 
 NegatedUnit: 'not' (STRING | NAME [Details] | '(' Alternatives ')')
 
-Repeater: '*' | '+' | '{' NUMBER ',' NUMBER '}'
+Repeater: '*' | '+' | '{' NUMBER [',' NUMBER] '}'
 
 Details: '<' Alternatives '>'
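
With the revised Repeater rule, an exact repetition count can be written as {N} and a range as {N,M}; the bare {N} form sets min == max. A minimal sketch of the new forms (the pattern strings and the import are illustrative, not taken from the diff):

    # Illustrative only: exact and ranged repetition under the new rule.
    import patcomp  # assumes patcomp is importable from the same directory
    pc = patcomp.PatternCompiler()
    exact = pc.compile_pattern("any{3}")     # matches exactly three nodes
    ranged = pc.compile_pattern("any{1,4}")  # matches one to four nodes

fix_apply.py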
 
 # Tree matching patterns
 pat_compile = patcomp.PatternCompiler().compile_pattern
-p_apply = pat_compile("""(
+p_apply = pat_compile("""
 power< 'apply'
     trailer<
         '('
         ')'
     >
 >
-)"""
+"""
     )
 
 
 
fix_has_key.py

 # Tree matching pattern
 pat_compile = patcomp.PatternCompiler().compile_pattern
-p_has_key_top = pat_compile("""(
+p_has_key_top = pat_compile("""
 power<
     before=any+
     trailer< '.' 'has_key' >
     >
     after=any*
 >
-)""")
+""")
 
 def fix_has_key(node):
     results = {}
 
patcomp.py

 __author__ = "Guido van Rossum <guido@python.org>"
 
-# Python tokens
-import sys
+# Python imports
 import token
+import tokenize
 
 # Fairly local imports
 from pgen2 import driver
             setattr(self, name, grammar.symbol2number[name])
 
 
+def tokenize_wrapper(input):
+    """Tokenizes a string suppressing significant whitespace."""
+    skip = (token.NEWLINE, token.INDENT, token.DEDENT)
+    tokens = tokenize.generate_tokens(driver.generate_lines(input).next)
+    for quintuple in tokens:
+        type, value, start, end, line_text = quintuple
+        if type not in skip:
+            yield quintuple
+
+
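A quick way to see what the new wrapper suppresses; a hedged sketch assuming patcomp is importable:

    # Hypothetical check: NEWLINE, INDENT and DEDENT never reach the
    # parser, so a pattern string may span lines and be indented freely.
    import token
    import patcomp
    for tok in patcomp.tokenize_wrapper("power<\n    'apply'\n>"):
        assert tok[0] not in (token.NEWLINE, token.INDENT, token.DEDENT)
        print token.tok_name[tok[0]], repr(tok[1])
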
 class PatternCompiler(object):
 
     def __init__(self, grammar_file="PatternGrammar.txt"):
 
     def compile_pattern(self, input, debug=False):
         """Compiles a pattern string to a nested pytree.*Pattern object."""
-        root = self.driver.parse_string(input, debug=debug)
+        tokens = tokenize_wrapper(input)
+        root = self.driver.parse_tokens(tokens, debug=debug)
         return self.compile_node(root)
 
     def compile_node(self, node):
 
         This is one big switch on the node type.
         """
-        # XXX Leave the optimizations to later
+        # XXX Optimize certain Wildcard-containing-Wildcard patterns
+        # that can be merged
         if node.type == self.syms.Matcher:
             node = node.children[0] # Avoid unneeded recursion
 
         if node.type == self.syms.Alternatives:
             # Skip the odd children since they are just '|' tokens
             alts = [self.compile_node(ch) for ch in node.children[::2]]
-            return pytree.WildcardPattern([[a] for a in alts], min=1, max=1)
+            if len(alts) == 1:
+                return alts[0]
+            p = pytree.WildcardPattern([[a] for a in alts], min=1, max=1)
+            return p.optimize()
 
         if node.type == self.syms.Alternative:
             units = [self.compile_node(ch) for ch in node.children]
-            return pytree.WildcardPattern([units], min=1, max=1)
+            if len(units) == 1:
+                return units[0]
+            p = pytree.WildcardPattern([units], min=1, max=1)
+            return p.optimize()
 
         if node.type == self.syms.NegatedUnit:
             pattern = self.compile_basic(node.children[1:])
-            return pytree.NegatedPattern(pattern)
+            p = pytree.NegatedPattern(pattern)
+            return p.optimize()
 
         assert node.type == self.syms.Unit
 
             child = children[0]
             if child.type == token.STAR:
                 min = 0
-                max = sys.maxint
+                max = pytree.HUGE
             elif child.type == token.PLUS:
                 min = 1
-                max = sys.maxint
+                max = pytree.HUGE
+            elif child.type == token.LBRACE:
+                assert children[-1].type == token.RBRACE
+                assert len(children) in (3, 5)
+                min = max = self.get_int(children[1])
+                if len(children) == 5:
+                    max = self.get_int(children[3])
             else:
-                assert len(children) == 5
-                assert child.type == token.LBRACE
-                min = self.get_int(children[1])
-                max = self.get_int(children[3])
-            pattern = pytree.WildcardPattern([[pattern]], min=min, max=max)
+                assert False
+            if min != 1 or max != 1:
+                pattern = pattern.optimize()
+                pattern = pytree.WildcardPattern([[pattern]], min=min, max=max)
+
         if name is not None:
             pattern.name = name
-        return pattern
+        return pattern.optimize()
 
     def compile_basic(self, nodes, repeat=None):
         # Compile STRING | NAME [Details] | (...) | [...]
         return pytree.Leaf(type, value, context=context)
 
 
-def test():
+_SAMPLE = """(a=(power< ('apply' trailer<'(' b=(not STRING) ')'> ) >){1})
+{1,1}"""
+
+
+def _test():
     pc = PatternCompiler()
-    pat = pc.compile_pattern("a=power< 'apply' trailer<'(' b=(not STRING) ')'> >")
+    pat = pc.compile_pattern(_SAMPLE)
     print pat
 
 
 if __name__ == "__main__":
-    test()
+    _test()
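
Taken together, the new token path means compile_pattern accepts multi-line, indented pattern strings, since tokenize_wrapper makes whitespace insignificant. A hedged usage sketch:

    # Sketch: the pattern source can now be laid out for readability.
    pc = PatternCompiler()
    pat = pc.compile_pattern("""
    power< 'apply'
        trailer< '(' ')' >
    >
    """)
    print pat

pgen2/driver.py
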
 from pgen2 import parse
 from pgen2 import grammar
 
+
 class Driver(object):
 
     def __init__(self, grammar, convert=None, logger=None):
         self.logger = logger
         self.convert = convert
 
-    def parse_stream_raw(self, stream, debug=False):
-        """Parse a stream and return the concrete syntax tree."""
+    def parse_tokens(self, tokens, debug=False):
+        """Parse a series of tokens and return the syntax tree."""
+        # XXX Move the prefix computation into a wrapper around tokenize.
         p = parse.Parser(self.grammar, self.convert)
         p.setup()
         lineno = 1
         column = 0
         type = value = start = end = line_text = None
         prefix = ""
-        for quintuple in tokenize.generate_tokens(stream.readline):
+        for quintuple in tokens:
             type, value, start, end, line_text = quintuple
             if start != (lineno, column):
                 assert (lineno, column) <= start, ((lineno, column), start)
             raise parse.ParseError("incomplete input", t, v, x)
         return p.rootnode
 
+    def parse_stream_raw(self, stream, debug=False):
+        """Parse a stream and return the syntax tree."""
+        tokens = tokenize.generate_tokens(stream.readline)
+        return self.parse_tokens(tokens, debug)
+
     def parse_stream(self, stream, debug=False):
         """Parse a stream and return the syntax tree."""
         return self.parse_stream_raw(stream, debug)
 
     def parse_string(self, text, debug=False):
         """Parse a string and return the syntax tree."""
-        from StringIO import StringIO
-        stream = StringIO(text)
-        return self.parse_stream(stream, debug)
+        tokens = tokenize.generate_tokens(generate_lines(text).next)
+        return self.parse_tokens(tokens, debug)
+
+
+def generate_lines(text):
+    """Generator that behaves like readline without using StringIO."""
+    for line in text.splitlines(True):
+        yield line
+    while True:
+        yield ""
+
 
 def load_grammar(gt="Grammar.txt", gp=None,
                  save=True, force=False, logger=None):
         g.load(gp)
     return g
 
+
 def _newer(a, b):
     """Inquire whether file a was written since file b."""
     if not os.path.exists(a):

pgen2/pgen.py

 class ParserGenerator(object):
 
     def __init__(self, filename, stream=None):
+        close_stream = None
         if stream is None:
             stream = open(filename)
+            close_stream = stream.close
         self.filename = filename
         self.stream = stream
         self.generator = tokenize.generate_tokens(stream.readline)
         self.gettoken() # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
+        if close_stream is not None:
+            close_stream()
         self.first = {} # map from symbol name to set of tokens
         self.addfirstsets()
 
 
pytree.py

 __author__ = "Guido van Rossum <guido@python.org>"
 
-import sys
+
+HUGE = 0x7FFFFFFF  # maximum repeat count, default max
 
 
 class Base(object):
         return object.__new__(cls, *args, **kwds)
 
     def __repr__(self):
-        return "%s(%s)" % (self.__class__.__name__,
-                           ", ".join("%s=%r" % (x, getattr(self, x))
-                                     for x in ("type", "content", "name")
-                                     if getattr(self, x) is not None))
+        args = [self.type, self.content, self.name]
+        while args and args[-1] is None:
+            del args[-1]
+        return "%s(%s)" % (self.__class__.__name__, ", ".join(map(repr, args)))
+
+    def optimize(self):
+        """A subclass can define this as a hook for optimizations.
+
+        Returns either self or another node with the same effect.
+        """
+        return self
 
     def match(self, node, results=None):
         """Does this pattern exactly match a node?
     except it always uses non-greedy matching.
     """
 
-    def __init__(self, content=None, min=0, max=sys.maxint, name=None):
+    def __init__(self, content=None, min=0, max=HUGE, name=None):
         """Initializer.
 
         Args:
                      if absent, matches one node;
                      if present, each subsequence is an alternative [*]
            min: optional minimum number of times to match, default 0
-            max: optional maximum number of times tro match, default sys.maxint
+            max: optional maximum number of times to match, default HUGE
             name: optional name assigned to this match
 
         [*] Thus, if content is [[a, b, c], [d, e], [f, g, h]] this is
             If content is not None, replace the dot with the parenthesized
             list of alternatives, e.g. (a b c | d e | f g h)*
         """
-        assert 0 <= min <= max <= sys.maxint, (min, max)
+        assert 0 <= min <= max <= HUGE, (min, max)
         if content is not None:
             content = tuple(map(tuple, content))  # Protect against alterations
             # Check sanity of alternatives
         self.max = max
         self.name = name
 
+    def optimize(self):
+        """Optimize certain stacked wildcard patterns."""
+        subpattern = None
+        if (self.content is not None and
+            len(self.content) == 1 and len(self.content[0]) == 1):
+            subpattern = self.content[0][0]
+        if self.min == 1 and self.max == 1:
+            if self.content is None:
+                return NodePattern(name=self.name)
+            if subpattern is not None and self.name == subpattern.name:
+                return subpattern.optimize()
+        if (self.min <= 1 and isinstance(subpattern, WildcardPattern) and
+            subpattern.min <= 1 and self.name == subpattern.name):
+            return WildcardPattern(subpattern.content,
+                                   self.min*subpattern.min,
+                                   self.max*subpattern.max,
+                                   subpattern.name)
+        return self
+
     def match(self, node, results=None):
         """Does this pattern exactly match a node?"""
         return self.match_seq([node], results)
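
The optimize hook lets compiled patterns shed redundant wrappers: a {1,1} wildcard around a single alternative collapses to that alternative, and nested wildcards with compatible bounds merge into one. A hedged sketch (assumes pytree's names are importable):

    # Sketch: a {1,1} wrapper around one named alternative collapses away.
    from pytree import WildcardPattern, HUGE  # assumption: pytree on path
    inner = WildcardPattern(min=0, max=HUGE, name="args")
    outer = WildcardPattern([[inner]], min=1, max=1, name="args")
    print repr(outer.optimize())  # the wrapper is gone; prints the inner pattern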