Commits

Hong Minhee committed 8ccbe54

Implemented an initial expression parser. (Intermediate commit.)

  • Parent commit 2395940

Files changed (3)

File kong/ast.py

     def __repr__(self):
         cls = type(self)
-        args = repr(self.arguments)[1:-1]
-        if args.endswith(','):
-            args = args[:-1]
-        vals = cls.__module__, cls.__name__, self.function, args
-        return '{0}.{1}({2!r}, [{3}])'.format(*vals)
+        vals = (cls.__module__, cls.__name__,
+                self.function, list(self.arguments))
+        return '{0}.{1}({2!r}, {3!r})'.format(*vals)
 
 
 class Attribute(Application):
         cls = type(self)
         f = u'{0}.{1}(operator={2!r}, operands={3!r})'
         return f.format(cls.__module__, cls.__name__,
-                        self.operator, self.operands)
+                        self.operator, list(self.operands))
 
 
 class Definition(Expression):
         blk = u'; '.join(itertools.imap(unicode, self.program))
         return u'({0}): {{ {1} }}'.format(params, blk)
 
+    def __repr__(self):
+        cls = type(self)
+        args = (cls.__module__, cls.__name__,
+                list(self.parameters), self.program)
+        return '{0}.{1}({2!r}, program={3!r})'.format(*args)
+
 
 class StringLiteral(Literal):
     """A string literal node.

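Note: the repr changes to kong/ast.py above make each node's repr a valid
constructor expression, which is the form the doctests in kong/parser.py
below print. A minimal sketch of the convention, using a hypothetical Node
class:

    class Node(object):
        """A hypothetical AST node following the repr convention above."""

        def __init__(self, function, arguments):
            self.function = function
            self.arguments = arguments

        def __repr__(self):
            cls = type(self)
            vals = (cls.__module__, cls.__name__,
                    self.function, list(self.arguments))
            # Produces e.g. mymodule.Node(u'f', []) -- itself valid Python.
            return '{0}.{1}({2!r}, {3!r})'.format(*vals)
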
File kong/lexer.py

 import numbers
 import collections
 import re
-try:
-    import cStringIO as StringIO
-except NameError:
-    import StringIO
 
 
 #: The :mod:`re` pattern that matches to tokens.
 TOKEN_PATTERN = re.compile(ur'''
-    (?P<string> "(?:[^"]|\\.)*" ) |
+    (?P<string> "(?:[^"]|\\[^xuU]|\\x[0-9a-fA-F]{2}
+                        |\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*" ) |
     (?P<arrow> <- ) |
     (?P<parenthesis> [()] ) |
     (?P<square_bracket> [[\]] ) |
                 return Token(str(tag), string, i)
     i = 0
     s = ''
-    buf = StringIO.StringIO()
     for chunk in stream:
         s += chunk
-        buf.write(chunk)
         while True:
             m = TOKEN_PATTERN.match(s)
             if m and len(s) > m.end():
             yield get_token(m)
         else:
             msg = 'invalid token({0}): {1!r}'.format(i, s[:10])
-            raise LexerError(buf.getvalue(), i, msg)
+            raise SyntaxError(i, msg)
 
 
 
         self.string = string
         self.offset = offset
 
+    def get_syntax_error(self, message=None):
+        """Makes a :exc:`SyntaxError` with its :attr:`offset`.
+
+        :param message: an optional error message
+        :type message: :class:`basestring`
+        :returns: an :exc:`SyntaxError` instance
+        :rtype: :exc:`SyntaxError`
+
+        """
+        return SyntaxError(self.offset, message)
+
     def __str__(self):
         if isinstance(self.string, str):
             return self.string
         return '{0}.{1}({2!r}, {3!r}, {4!r})'.format(*args)
 
 
-class LexerError(ValueError, SyntaxError):
-    """An exception that rises when the invalid token meets."""
-
-    #: (:class:`basestring`) The errored code string.
-    string = None
+class SyntaxError(ValueError, SyntaxError):
+    """An exception that rises when the syntax is invalid."""
 
-    #: (:class:`numbers.Integral`) The errored offset of the :attr:`string`.
+    #: (:class:`numbers.Integral`) The errored offset in the code string.
     offset = None
 
-    def __init__(self, string, offset, message=None):
-        if not isinstance(string, basestring):
-            raise TypeError('expected string, not ' + repr(string))
-        elif not isinstance(offset, numbers.Integral):
+    def __init__(self, offset, message=None):
+        if not isinstance(offset, numbers.Integral):
             raise TypeError('offset must be an integer, not ' + repr(offset))
-        super(LexerError, self).__init__(message)
-        self.string = string
+        super(SyntaxError, self).__init__(message)
         self.offset = offset
 
-    @property
-    def line(self):
-        """(:class:`numbers.Integral`) The errored line number.
-        Starts from 0.
+    def get_line(self, string):
+        """Gets the errored line number from the code ``string``.
+
+        :param string: code string
+        :type string: :class:`basestring`
+        :returns: 0-based line number
+        :rtype: :class:`numbers.Integral`
 
         """
-        return self.string.count(u'\n', 0, self.offset)
+        return string.count(u'\n', 0, self.offset)
 
-    @property
-    def column(self):
-        """(:class:`numbers.Integral`) The errored column number of
-        the :attr:`line`. Starts from 0.
+    def get_column(self, string):
+        """Gets the errored column number of the :attr:`line` from the code
+        ``string``.
+
+        :param string: code string
+        :type string: :class:`basestring`
+        :returns: 0-based column number
+        :rtype: :class:`numbers.Integral`
 
         """
         try:
-            pos = self.string.rindex(u'\n', 0, self.offset)
+            pos = string.rindex(u'\n', 0, self.offset)
         except ValueError:
             return 0
         return self.offset - pos - 1

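Note: since the error no longer stores the offending source, callers now pass
the code string in to recover positions. A minimal usage sketch (the source
text and offset here are illustrative):

    from kong.lexer import SyntaxError

    source = u'f(a,\n    1 +'
    error = SyntaxError(9, 'unexpected end of input')
    print error.get_line(source)    # 1 -- one newline precedes offset 9
    print error.get_column(source)  # 4 -- distance from the last newline
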
File kong/parser.py

+""":mod:`kong.parser` --- Tofu parser
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+"""
+import collections
+import numbers
+import re
+from .lexer import Token, SyntaxError
+from .ast import (Expression, Identifier, Application, Attribute, Operator,
+                  IdentifierLocalDefinition, IdentifierAssignment,
+                  AttributeDefinition, ListLiteral, StringLiteral)
+
+
+__all__ = 'parse_expression',
+
+
+#: The :mod:`re` pattern that matches string literal escape sequences.
+STRING_LITERAL_ESCAPE_SEQUENCE_PATTERN = re.compile(
+    r'\\(?P<literal>[^xuU])|\\x(?P<x>[0-9A-Fa-f]{2})|'
+    r'\\u(?P<u>[0-9A-Fa-f]{4})|\\U(?P<U>[0-9A-Fa-f]{8})'
+)
+
+#: (:class:`dict`) The table of string literal escape sequences.
+STRING_LITERAL_ESCAPE_SEQUENCE_TABLE = {
+    'a': '\a', 'b': '\b', 't': '\t', 'n': '\n',
+    'v': '\v', 'f': '\f', 'r': '\r'
+}
+
+
+class ParsedToken(collections.namedtuple('_ParsedToken', 'token expression')):
+    """The namedtuple that contains parsed :attr:`token` and its result
+    :attr:`expression`.
+
+    :param token: parsed token
+    :type token: :class:`~kong.lexer.Token`
+    :param expression: parsing result
+    :type expression: :class:`~kong.ast.Expression`
+
+    .. attribute:: token
+
+       (:class:`~kong.lexer.Token`) The parsed token.
+
+    .. attribute:: expression
+
+       (:class:`~kong.ast.Expression`) The parsing result.
+
+    """
+
+    def __new__(cls, token, expression):
+        if isinstance(token, cls):
+            token = token.token
+        if not isinstance(token, Token):
+            raise TypeError('token must be a kong.lexer.Token instance, not '
+                            + repr(token))
+        elif not isinstance(expression, Expression):
+            raise TypeError('expression must be a kong.ast.Expression '
+                            'instance, not ' + repr(expression))
+        return super(ParsedToken, cls).__new__(cls, token, expression)
+
+    @property
+    def offset(self):
+        """(:class:`numbers.Integral`) The :attr:`~kong.lexer.Token.offset`
+        of :attr:`token`.
+
+        """
+        return self.token.offset
+
+    @property
+    def tag(self):
+        """(:class:`basestring`) The :attr:`~kong.lexer.Token.tag` of
+        :attr:`token`.
+
+        """
+        return self.token.tag
+
+    @property
+    def string(self):
+        """(:class:`basestring`) The :attr:`~kong.lexer.Token.string` of
+        :attr:`token`.
+
+        """
+        return self.token.string
+
+    def isa(self, expression_type):
+        """Predicate method that checks whether :attr:`expression` is an
+        instance of ``expression_type``. It is equivalent to::
+
+            isinstance(parsed_token.expression, expression_type)
+
+        :param expression_type: a subtype of :class:`~kong.ast.Expression`,
+                                or iterable of subtypes
+        :type expression_type: :class:`type`, :class:`collections.Iterable`
+        :returns: whether :attr:`expression` is an instance of
+                  ``expression_type``
+        :rtype: :class:`bool`
+
+        """
+        if isinstance(expression_type, collections.Iterable):
+            return any(self.isa(t) for t in expression_type)
+        if not isinstance(expression_type, type):
+            raise TypeError('expected a type object, not ' +
+                            repr(expression_type))
+        elif not issubclass(expression_type, Expression):
+            raise TypeError('expected a subtype of kong.ast.Expression, not '
+                            + repr(expression_type))
+        return isinstance(self.expression, expression_type)
+
+
+def parse_expression(tokens):
+    """Parses an expression.
+
+    .. sourcecode:: pycon
+
+       >>> from kong.lexer import tokenize
+       >>> p = lambda s: parse_expression(tokenize(s))
+       >>> p('f()')
+       kong.ast.Application(kong.ast.Identifier(u'f'), [])
+       >>> p('f(a, f2())')  # doctest: +NORMALIZE_WHITESPACE
+       kong.ast.Application(kong.ast.Identifier(u'f'),
+           [kong.ast.Identifier(u'a'),
+            kong.ast.Application(kong.ast.Identifier(u'f2'), [])])
+       >>> p('b <- 1 + (a <- 2) * 3')  # doctest: +NORMALIZE_WHITESPACE
+       kong.ast.IdentifierLocalDefinition(lvalue=kong.ast.Identifier(u'b'),
+        rvalue=kong.ast.Operator(operator=kong.ast.Identifier(u'*'),
+         operands=[kong.ast.Operator(operator=kong.ast.Identifier(u'+'),
+          operands=[kong.ast.StringLiteral(1),
+           kong.ast.IdentifierLocalDefinition(lvalue=kong.ast.Identifier(u'a'),
+            rvalue=kong.ast.StringLiteral(2))]),
+          kong.ast.StringLiteral(3)]))
+       >>> p('(1 + 2).*')  # doctest: +NORMALIZE_WHITESPACE
+       kong.ast.Attribute(kong.ast.Operator(operator=kong.ast.Identifier(u'+'),
+         operands=[kong.ast.StringLiteral(1), kong.ast.StringLiteral(2)]),
+        attribute=kong.ast.Identifier(u'*'))
+       >>> p('[]')
+       kong.ast.ListLiteral([])
+       >>> p('[a, 1]')  # doctest: +NORMALIZE_WHITESPACE
+       kong.ast.ListLiteral([kong.ast.Identifier(u'a'),
+                             kong.ast.StringLiteral(1)])
+       >>> p('[1, [2, 3], +]')  # doctest: +NORMALIZE_WHITESPACE
+       kong.ast.ListLiteral([kong.ast.StringLiteral(1),
+           kong.ast.ListLiteral([kong.ast.StringLiteral(2),
+               kong.ast.StringLiteral(3)]),
+           kong.ast.Identifier(u'+')])
+
+    :param tokens: tokens to parse
+    :type tokens: :class:`collections.Iterable`
+    :returns: parsed expression
+    :rtype: :class:`kong.ast.Expression`
+    :raises: :exc:`kong.lexer.SyntaxError` for invalid syntax
+
+    """
+    buffer = []
+    def tag(token, *tags):
+        return isinstance(token, Token) and token.tag in tags
+    def rfind_token(tag, string=None):
+        if buffer:
+            i = len(buffer) - 1
+            while i >= 0:
+                t = buffer[i]
+                if (isinstance(t, Token) and t.tag == tag and
+                    (string is None or t.string == string)):
+                    return i
+                i -= 1
+        raise LookupError()
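+    # Splits buffer[i + 1:] on comma tokens, reducing each group to a
+    # single expression; shared by argument lists and list literals.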
+    def commalist(i):
+        list_elements = []
+        element_buffer = []
+        for list_el_token in buffer[i + 1:] + [None]:
+            if (list_el_token is None or
+                list_el_token.tag == 'comma'):
+                if element_buffer:
+                    list_element = associate(element_buffer)
+                    if len(list_element) > 1:
+                        raise SyntaxError(
+                            list_element[1].offset,
+                            'unexpected token ' + repr(list_element[1].string)
+                        )
+                    list_elements.append(
+                        list_element[0].expression
+                    )
+                    element_buffer = []
+                elif list_el_token is not None:
+                    raise SyntaxError(list_el_token.offset,
+                                      'empty argument or element')
+            else:
+                element_buffer.append(list_el_token)
+        return list_elements
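+    # Shift each token onto the buffer, reducing eagerly when a closing
+    # parenthesis or bracket arrives; the remainder is associated below.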
+    for token in tokens:
+        if not isinstance(token, Token):
+            raise TypeError('expected iterable of kong.lexer.Token, not of ' +
+                            repr(token))
+        if tag(token, 'identifier'):
+            ident = Identifier(token.string)
+            buffer.append(ParsedToken(token, ident))
+        elif tag(token, 'string', 'number'):
+            string = parse_string_literal(token)
+            buffer.append(ParsedToken(token, string))
+        elif tag(token, 'parenthesis'):
+            if token.string == u'(':
+                buffer.append(token)
+            else:
+                try:
+                    i = rfind_token('parenthesis', u'(')
+                except LookupError:
+                    raise SyntaxError(token.offset,
+                                      'unexpected closing parenthesis ")"')
+                else:
+                    if (i > 0 and isinstance(buffer[i - 1], ParsedToken) and
+                        (i < 2 or isinstance(buffer[i - 2], Token))):
+                        args = commalist(i)
+                        node = Application(buffer[i - 1].expression, args)
+                        parsed = ParsedToken(buffer[i - 1].token, node)
+                        buffer[i - 1:] = [parsed]
+                    else:
+                        buffer[i:] = associate(buffer[i + 1:])
+        elif tag(token, 'square_bracket'):
+            if token.string == u'[':
+                buffer.append(token)
+            else:
+                try:
+                    i = rfind_token('square_bracket', u'[')
+                except LookupError:
+                    raise SyntaxError(token.offset,
+                                      'unexpected closing square bracket "]"')
+                else:
+                    node = ListLiteral(commalist(i))
+                    buffer[i:] = [ParsedToken(buffer[i], node)]
+        elif tag(token, 'period'):
+            if not buffer or not isinstance(buffer[-1], ParsedToken):
+                raise SyntaxError(token.offset, 'unexpected period "."')
+            buffer.append(token)
+        elif tag(token, 'arrow', 'comma'):
+            buffer.append(token)
+    result = associate(buffer)
+    if not result:
+        raise SyntaxError(0, 'nothing to parse')
+    elif len(result) > 1:
+        token = result[1]
+        raise SyntaxError(token.offset,
+                          'unexpected token {0!r}'.format(token.string))
+    return result[0].expression
+
+
+def parse_string_literal(token):
+    """Parses a string literal token.
+
+    :param token: a string token
+    :type token: :class:`kong.lexer.Token`
+    :returns: parsed string literal
+    :rtype: :class:`kong.ast.StringLiteral`
+
+    """
+    if not isinstance(token, Token):
+        raise TypeError('expected a kong.lexer.Token, not ' + repr(token))
+    elif token.tag == 'number':
+        return StringLiteral(unicode(token.string))
+    elif token.tag != 'string':
+        raise ValueError('expected string token, not ' + token.tag)
+    def repl(match):
+        c = match.group('literal')
+        if c:
+            try:
+                return STRING_LITERAL_ESCAPE_SEQUENCE_TABLE[c]
+            except KeyError:
+                return c
+        u = match.group('x') or match.group('u') or match.group('U')
+        return unichr(int(u, 16))
+    s = STRING_LITERAL_ESCAPE_SEQUENCE_PATTERN.sub(repl, token.string[1:-1])
+    return StringLiteral(unicode(s))
+
+
+def associate(buffer):
+    """Associates associative expressions from the given ``buffer``,
+    a sequence of :class:`ParsedToken` and :class:`~kong.lexer.Token` objects.
+
+    :param buffer: a sequence of :class:`ParsedToken` and
+                   :class:`~kong.lexer.Token` objects
+    :type buffer: :class:`collections.Sequence`
+    :returns: associated result
+    :rtype: :class:`collections.Sequence`
+
+    """
+    buffer = associate_attributes(buffer)
+    buffer = associate_operator(buffer)
+    buffer = associate_definitions(buffer)
+    return buffer
+
+
+def associate_attributes(buffer):
+    """Associates :class:`~kong.ast.Attribute` expressions from the given
+    ``buffer``, a sequence of :class:`ParsedToken` and
+    :class:`~kong.lexer.Token` objects.
+
+    :param buffer: a sequence of :class:`ParsedToken` and
+                   :class:`~kong.lexer.Token` objects
+    :type buffer: :class:`collections.Sequence`
+    :returns: associated result
+    :rtype: :class:`collections.Sequence`
+
+    """
+    if not isinstance(buffer, collections.Sequence):
+        raise TypeError('buffer must be collections.Sequence, not ' +
+                        repr(buffer))
+    result = []
+    length = len(buffer)
+    skipto = -1
+    for i, token in enumerate(buffer):
+        if i <= skipto:
+            continue
+        elif isinstance(token, Token) and token.tag == 'period':
+            if i < 1 or i >= length - 1:
+                raise SyntaxError(token.offset, 'unexpected period "."')
+            prev = result[-1]
+            next = buffer[i + 1]
+            if not (isinstance(prev, ParsedToken) and
+                    isinstance(next, ParsedToken) and
+                    next.isa(Identifier)):
+                raise SyntaxError(token.offset, 'unexpected period "."')
+            del result[-1]
+            node = Attribute(prev.expression, attribute=next.expression)
+            parsed = ParsedToken(token, node)
+            result.append(parsed)
+            skipto = i + 1
+        else:
+            result.append(token)
+    return result
+
+
+def associate_operator(buffer):
+    """Associates :class:`~kong.ast.Operator` expressions from the given
+    ``buffer``, a sequence of :class:`ParsedToken` and
+    :class:`~kong.lexer.Token` objects.
+
+    :param buffer: a sequence of :class:`ParsedToken` and
+                   :class:`~kong.lexer.Token` objects
+    :type buffer: :class:`collections.Sequence`
+    :returns: associated result
+    :rtype: :class:`collections.Sequence`
+
+    """
+    if not isinstance(buffer, collections.Sequence):
+        raise TypeError('buffer must be collections.Sequence, not ' +
+                        repr(buffer))
+    result = []
+    length = len(buffer)
+    skipto = -1
+    for i, token in enumerate(buffer):
+        if i <= skipto:
+            continue
+        elif (isinstance(token, ParsedToken) and token.isa(Identifier) and
+              0 < i < length - 1):
+            prev = result[-1]
+            next = buffer[i + 1]
+            if not (isinstance(prev, ParsedToken) and
+                    isinstance(next, ParsedToken)):
+                raise SyntaxError(token.offset,
+                                  'unexpected identifier ' +
+                                  repr(token.string))
+            operator = Operator(operator=token.expression,
+                                operands=(prev.expression, next.expression))
+            result[-1] = ParsedToken(token, operator)
+            skipto = i + 1
+        else:
+            result.append(token)
+    return result
+
+
+def associate_definitions(buffer):
+    """Associates :class:`~kong.ast.Definition` expressions from the given
+    ``buffer``, a sequence of :class:`ParsedToken` and
+    :class:`~kong.lexer.Token` objects.
+
+    :param buffer: a sequence of :class:`ParsedToken` and
+                   :class:`~kong.lexer.Token` objects
+    :type buffer: :class:`collections.Sequence`
+    :returns: associated result
+    :rtype: :class:`collections.Sequence`
+
+    """
+    if not isinstance(buffer, collections.Sequence):
+        raise TypeError('buffer must be collections.Sequence, not ' +
+                        repr(buffer))
+    result = []
+    length = len(buffer)
+    skipto = -1
+    for i, token in enumerate(buffer):
+        if i <= skipto:
+            continue
+        elif isinstance(token, Token) and token.tag == 'arrow':
+            if i < 1 or i >= length - 1:
+                raise SyntaxError(token.offset, 'unexpected arrow "<-"')
+            prev = result[-1]
+            next = buffer[i + 1]
+            if not (isinstance(prev, ParsedToken) and
+                    prev.isa((Identifier, Attribute)) and
+                    isinstance(next, ParsedToken)):
+                raise SyntaxError(token.offset, 'unexpected arrow "<-"')
+            elif prev.isa(Attribute):
+                defnodecls = AttributeDefinition
+                skip = 1
+            elif prev.isa(Identifier):
+                if (i > 1 and isinstance(result[-2], Token) and
+                    result[-2].tag == 'period'):
+                    defnodecls = IdentifierAssignment
+                    skip = 2
+                else:
+                    defnodecls = IdentifierLocalDefinition
+                    skip = 1
+            # Remove the consumed lvalue (and its preceding period, if any)
+            # from the result, and skip past the rvalue token.
+            del result[-skip:]
+            skipto = i + skip
+            node = defnodecls(lvalue=prev.expression, rvalue=next.expression)
+            parsed = ParsedToken(token, node)
+            result.append(parsed)
+        else:
+            result.append(token)
+    return result
+
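Note: a short sketch of how the escape handling in parse_string_literal plays
out, assuming tokenize yields the final token once its input ends (as the
parse_expression doctests imply) and that StringLiteral simply wraps the
decoded unicode text:

    from kong.lexer import tokenize
    from kong.parser import parse_string_literal

    token = next(tokenize(r'"a\tb\x41\u00e9"'))
    literal = parse_string_literal(token)
    # The decoded text should be u'a\tbA\xe9': \t comes from the escape
    # table, \x41 and \u00e9 go through unichr(), and unrecognized
    # single-character escapes pass through unchanged.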