Commits

Hong Minhee committed 3468fbd

Streaming lexer.

  • Parent commits 2570b55

Files changed (1)

File kong/lexer.py

 import numbers
 import collections
 import re
+try:
+    import cStringIO as StringIO
+except ImportError:
+    import StringIO
 
 
 #: The :mod:`re` pattern that matches tokens.
     (?P<number> \d+ ) |
     (?P<identifier> (?: [^\s\d"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )
                     (?: [^\s"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )* ) |
-    (?P<newline> (?: [#] [^\n]* \r?\n )+ ) |
+    (?P<newline> (?: (?: [#] [^\n]* )? \r?\n )+ ) |
     (?P<space> [ \t]+ )
 ''', re.VERBOSE)
 
 
-def tokenize(string):
-    """Makes tokens from input ``string``.
+def tokenize(stream):
+    r"""Makes tokens from input ``stream``.
 
     .. sourcecode:: pycon
 
         kong.lexer.Token(u'number', u'123', 10),
         kong.lexer.Token(u'parenthesis', u')', 13)]
 
-    :param string: input string
-    :type string: :class:`basestring`
+    It supports streaming as well:
+
+    .. sourcecode:: pycon
+
+       >>> stream = [u'a(12', u'3)\nb<', u'-c * 123']
+       >>> t(stream)  # doctest: +NORMALIZE_WHITESPACE
+       [kong.lexer.Token(u'identifier', u'a', 0),
+        kong.lexer.Token(u'parenthesis', u'(', 1),
+        kong.lexer.Token(u'number', u'123', 2),
+        kong.lexer.Token(u'parenthesis', u')', 5),
+        kong.lexer.Token(u'newline', u'\n', 6),
+        kong.lexer.Token(u'identifier', u'b', 7),
+        kong.lexer.Token(u'arrow', u'<-', 8),
+        kong.lexer.Token(u'identifier', u'c', 10),
+        kong.lexer.Token(u'space', u' ', 11),
+        kong.lexer.Token(u'identifier', u'*', 12),
+        kong.lexer.Token(u'space', u' ', 13),
+        kong.lexer.Token(u'number', u'123', 14)]
+
+    :param stream: input stream
+    :type stream: :class:`collections.Iterable`
     :returns: :class:`Token` list
     :rtype: :class:`collections.Iterable`
 
     """
-    if not isinstance(string, basestring):
-        raise TypeError('expected string, not ' + repr(string))
-    s = string
-    i = 0
-    while s:
-        m = TOKEN_PATTERN.match(s)
-        if not m:
-            raise LexerError(string, i, 'invalid token')
+    if isinstance(stream, basestring):
+        stream = stream,
+    elif not isinstance(stream, collections.Iterable):
+        raise TypeError('stream must be iterable')
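+    # Return a Token for whichever named group in the match is non-empty,
+    # stamped with the current input offset ``i``.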
+    def get_token(m):
         d = m.groupdict()
         for tag, string in d.iteritems():
             if string:
-                yield Token(tag, string, i)
+                return Token(tag, string, i)
+    i = 0
+    s = ''
+    buf = StringIO.StringIO()
+    for chunk in stream:
+        s += chunk
+        buf.write(chunk)
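+        # Emit every token that is already complete.  A match that reaches
+        # the end of the buffer may still grow with the next chunk, so it
+        # stays buffered until more input arrives (or the stream ends).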
+        while True:
+            m = TOKEN_PATTERN.match(s)
+            if m and len(s) > m.end():
+                yield get_token(m)
+                i += m.end()
+                s = s[m.end():]
+            else:
                 break
-        i += m.end()
-        s = s[m.end():]
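+    # Flush whatever is still buffered as one final token, or report an
+    # invalid token if the leftover text does not match the pattern.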
+    if s:
+        m = TOKEN_PATTERN.match(s)
+        if m:
+            yield get_token(m)
+        else:
+            msg = 'invalid token({0}): {1!r}'.format(i, s[:10])
+            raise LexerError(buf.getvalue(), i, msg)
 
 
 class Token(object):
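
As a quick illustration of the new interface, here is a minimal sketch (not part of the commit) that feeds tokenize() from a file read in fixed-size chunks; the read_chunks helper and the program.kong file name are hypothetical.

    # Sketch only: drive the streaming lexer from a file, chunk by chunk.
    from kong.lexer import tokenize

    def read_chunks(fileobj, size=4096):
        # Yield fixed-size chunks until the file is exhausted.
        while True:
            chunk = fileobj.read(size)
            if not chunk:
                break
            yield chunk

    with open('program.kong') as source:  # hypothetical input file
        for token in tokenize(read_chunks(source)):
            print repr(token)  # tokens are yielded as the input streams in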