Commits

Hong Minhee committed 2570b55

Lexer.

  • Participants
  • Parent commits 789e54d

Comments (0)

Files changed (6)

File docs/kong.rst

       :maxdepth: 2
 
       kong/ast
+      kong/lexer
 

File docs/kong/lexer.rst

+
+.. automodule:: kong.lexer
+   :members:
+

File kong/__init__.py

-""":mod:`kong`
-~~~~~~~~~~~~~~
+""":mod:`kong` --- Tofu implementation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 """
 import numbers
 import collections
 import itertools
+import re
 
 
 class Node(object):
 
     __slots__ = ()
 
+    def __new__(cls, string, *args, **kwargs):
+        """Builds the :class:`unicode` value and validates it as
+        an identifier.
+
+        :raises ValueError: when the text is empty, contains ``<-``,
+                            whitespace or one of ``.,:(){}``, or consists
+                            of digits only
+
+        """
+        ident = unicode.__new__(cls, string, *args, **kwargs)
+        if ident == u'':
+            raise ValueError('identifier cannot be empty')
+        has_reserved = re.search(ur'<-|[\s.,:(){}]', ident)
+        if has_reserved or ident.isdigit():
+            text = unicode(ident)
+            raise ValueError('invalid identifier: {0!r}'.format(text))
+        return ident
+
     def __repr__(self):
         cls = type(self)
         repr = unicode.__repr__(self)
         self.function = function
         self.arguments = tuple(args())
 
+    def __eq__(self, operand):
+        """Equal when ``operand`` is an :class:`Application` with an equal
+        :attr:`function` and equal :attr:`arguments`.
+
+        """
+        if not isinstance(operand, Application):
+            return False
+        return (self.function == operand.function and
+                self.arguments == operand.arguments)
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
     def __unicode__(self):
         args = u', '.join(itertools.imap(unicode, self.arguments))
         return u'{0}({1})'.format(unicode(self.function), args)
         forms = a, self.operator, b
         return u' '.join(itertools.imap(unicode, forms))
 
+    def __repr__(self):
+        """Formats the node as ``module.Class(operator=..., operands=...)``."""
+        kind = type(self)
+        return u'{0}.{1}(operator={2!r}, operands={3!r})'.format(
+            kind.__module__, kind.__name__, self.operator, self.operands
+        )
+
 
 class Definition(Expression):
     """An abstract class for definition nodes.
     #: (:class:`Expression`) Rvalue expression.
     rvalue = NotImplemented
 
+    def __eq__(self, operand):
+        """Equal when ``operand`` has exactly the same type and an equal
+        :attr:`lvalue` and :attr:`rvalue`.
+
+        """
+        if type(operand) is not type(self):
+            return False
+        return (self.lvalue == operand.lvalue and
+                self.rvalue == operand.rvalue)
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
     def __unicode__(self):
         args = self.lvalue, '<-', self.rvalue
         return u' '.join(itertools.imap(unicode, args))
         """
         return self.arguments[1]
 
+    def __eq__(self, operand):
+        """Compares by the :class:`Definition` rule when ``operand`` is
+        a :class:`Definition`, and by the :class:`Application` rule
+        otherwise.
+
+        """
+        if isinstance(operand, Definition):
+            base = Definition
+        else:
+            base = Application
+        return base.__eq__(self, operand)
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
 
 class Literal(Expression):
     """A literal node. It is an abstract class.
             program = Program(program)
         self.program = program
 
+    def __eq__(self, operand):
+        """Equal when ``operand`` is an instance of this object's type
+        and holds an equal :attr:`program`.
+
+        """
+        if not isinstance(operand, type(self)):
+            return False
+        return self.program == operand.program
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
     def __unicode__(self):
         if self.program:
             blk = u'; '.join(itertools.imap(unicode, self.program))
         self.parameters = tuple(parameters)
         self.program = program
 
+    def __eq__(self, operand):
+        """Equal when ``operand`` is an instance of this object's type
+        with equal :attr:`parameters` and an equal :attr:`program`.
+
+        """
+        if not isinstance(operand, type(self)):
+            return False
+        return (self.parameters == operand.parameters and
+                self.program == operand.program)
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
     def __unicode__(self):
         params = u', '.join(itertools.imap(unicode, self.parameters))
         blk = u'; '.join(itertools.imap(unicode, self.program))
                             'not ' + repr(string))
         self.string = string if type(string) is unicode else unicode(string)
 
+    def __eq__(self, operand):
+        """Equal when ``operand`` is an instance of this object's type
+        and holds an equal :attr:`string`.
+
+        """
+        if not isinstance(operand, type(self)):
+            return False
+        return self.string == operand.string
+
+    def __ne__(self, operand):
+        """The negation of ``self == operand``."""
+        equal = self == operand
+        return not equal
+
+    def __hash__(self):
+        """Hashes by :attr:`string`, the same value equality compares."""
+        text = self.string
+        return hash(text)
+
     def __unicode__(self):
         if self.string.isdigit():
             return self.string

File kong/lexer.py

+""":mod:`kong.lexer` --- Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+"""
+import numbers
+import collections
+import re
+
+
+# NOTE(review): ``newline`` only matches line breaks that terminate
+# a ``#`` comment; a bare line break outside a string literal matches no
+# alternative at all -- confirm that this is intended.
+#
+# The ``string`` alternative excludes the backslash from its plain-character
+# class so that an escape such as ``\"`` can only be consumed as a pair and
+# cannot terminate the literal early.
+#: The :mod:`re` pattern that matches a single token.  Every alternative
+#: is a named group; the name of the group that matched identifies the kind
+#: of the token.
+TOKEN_PATTERN = re.compile(ur'''
+    (?P<string> "(?:[^"\\]|\\.)*" ) |
+    (?P<arrow> <- ) |
+    (?P<parenthesis> [()] ) |
+    (?P<square_bracket> [[\]] ) |
+    (?P<curly_bracket> [{}] ) |
+    (?P<comma> , ) |
+    (?P<period> \. ) |
+    (?P<terminate> ; ) |
+    (?P<colon> : ) |
+    (?P<number> \d+ ) |
+    (?P<identifier> (?: [^\s\d"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )
+                    (?: [^\s"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )* ) |
+    (?P<newline> (?: [#] [^\n]* \r?\n )+ ) |
+    (?P<space> [ \t]+ )
+''', re.VERBOSE)
+
+
+def tokenize(string):
+    """Makes tokens from input ``string``.
+
+    .. sourcecode:: pycon
+
+       >>> t = lambda s: list(tokenize(s))
+       >>> t(u'a<-func  (123)')  # doctest: +NORMALIZE_WHITESPACE
+       [kong.lexer.Token(u'identifier', u'a', 0),
+        kong.lexer.Token(u'arrow', u'<-', 1),
+        kong.lexer.Token(u'identifier', u'func', 3),
+        kong.lexer.Token(u'space', u'  ', 7),
+        kong.lexer.Token(u'parenthesis', u'(', 9),
+        kong.lexer.Token(u'number', u'123', 10),
+        kong.lexer.Token(u'parenthesis', u')', 13)]
+
+    It is a generator function, so the argument check and the scanning
+    both happen lazily, as the result is iterated.
+
+    :param string: input string
+    :type string: :class:`basestring`
+    :returns: :class:`Token` list
+    :rtype: :class:`collections.Iterable`
+    :raises TypeError: when ``string`` is not a :class:`basestring`
+    :raises LexerError: when no token pattern matches at some offset;
+                        the error carries the whole input ``string``
+                        and that offset
+
+    """
+    if not isinstance(string, basestring):
+        raise TypeError('expected string, not ' + repr(string))
+    offset = 0
+    length = len(string)
+    while offset < length:
+        # Match in place from ``offset`` instead of re-slicing the input,
+        # and keep ``string`` bound to the whole input for error reports.
+        m = TOKEN_PATTERN.match(string, offset)
+        if not m:
+            raise LexerError(string, offset, 'invalid token')
+        # Every alternative of the pattern is one named group that spans
+        # the whole match, so ``lastgroup`` is the tag of the token.
+        yield Token(m.lastgroup, m.group(), offset)
+        offset = m.end()
+
+
+class Token(object):
+    """A token that contains :attr:`tag`, :attr:`string` and :attr:`offset`.
+
+    .. attribute:: tag
+
+       (:class:`basestring`) The type of token e.g. ``'arrow'``, ``'colon'``.
+
+    .. attribute:: string
+
+       (:class:`basestring`) The token string.
+
+    .. attribute:: offset
+
+       (:class:`numbers.Integral`) The token offset.
+
+    """
+
+    __slots__ = 'tag', 'string', 'offset'
+
+    def __init__(self, tag, string, offset):
+        self.tag = tag
+        self.string = string
+        self.offset = offset
+
+    def __str__(self):
+        """Returns the token string as :class:`str`."""
+        if isinstance(self.string, str):
+            return self.string
+        return str(self.string)
+
+    def __unicode__(self):
+        """Returns the token string as :class:`unicode`."""
+        # Inspect the token string itself: the slots declare no ``unicode``
+        # attribute, so reading one would raise AttributeError.
+        if isinstance(self.string, unicode):
+            return self.string
+        return unicode(self.string)
+
+    def __repr__(self):
+        """Formats the token as ``module.Class(tag, string, offset)``."""
+        cls = type(self)
+        args = (cls.__module__, cls.__name__,
+                self.tag, self.string, self.offset)
+        return '{0}.{1}({2!r}, {3!r}, {4!r})'.format(*args)
+
+
+class LexerError(ValueError, SyntaxError):
+    """An exception raised when the lexer meets an invalid token."""
+
+    #: (:class:`basestring`) The code string in which the error occurred.
+    string = None
+
+    #: (:class:`numbers.Integral`) The offset of the error in :attr:`string`.
+    offset = None
+
+    def __init__(self, string, offset, message=None):
+        """Stores the code ``string`` and the ``offset`` of the error.
+
+        :param string: the code string in which the error occurred
+        :type string: :class:`basestring`
+        :param offset: the offset of the error in ``string``
+        :type offset: :class:`numbers.Integral`
+        :param message: an optional error message
+        :raises TypeError: when ``string`` is not a :class:`basestring`
+                           or ``offset`` is not an integer
+
+        """
+        if not isinstance(string, basestring):
+            raise TypeError('expected string, not ' + repr(string))
+        elif not isinstance(offset, numbers.Integral):
+            raise TypeError('offset must be an integer, not ' + repr(offset))
+        super(LexerError, self).__init__(message)
+        self.string = string
+        self.offset = offset
+
+    @property
+    def line(self):
+        """(:class:`numbers.Integral`) The line number of the error.
+        Starts from 0.
+
+        """
+        return self.string.count(u'\n', 0, self.offset)
+
+    @property
+    def column(self):
+        """(:class:`numbers.Integral`) The column number of the error
+        within the :attr:`line`. Starts from 0.
+
+        """
+        # rfind() gives -1 when no line break precedes the offset, which
+        # makes the column equal to the offset itself on the first line.
+        line_break = self.string.rfind(u'\n', 0, self.offset)
+        return self.offset - line_break - 1
+
       author='Hong Minhee',
       author_email='minhee' '@' 'dahlia.kr',
       packages=['kong'],
-      install_requires=['LEPL'],
       extras_require={'docs': ['Sphinx >=1.0']},
       test_suite='kongtests.suite',
       license='MIT License')