# Source: Kong -- kong/lexer.py
""":mod:`kong.lexer` --- Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

"""
import numbers
import collections
import re


#: The :mod:`re` pattern that matches to tokens.
#
# The pattern is a single alternation of *named* groups; for any
# successful match exactly one group captures a non-empty string, and
# that group's name becomes the token tag (see :func:`tokenize`).
# ``identifier`` deliberately excludes every punctuation character that
# has its own token type, and allows ``<`` only when it cannot start an
# ``<-`` arrow (next char not ``-``, or ``<`` is at end of the buffer).
# ``newline`` also swallows ``#``-comments preceding the line break.
# NOTE(review): the ``ur'...'`` prefix is Python 2-only syntax; this
# module targets Python 2.
TOKEN_PATTERN = re.compile(ur'''
    (?P<string> "(?:[^"]|\\[^xuU]|\\x[0-9a-fA-F]{2}
                        |\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*" ) |
    (?P<arrow> <- ) |
    (?P<parenthesis> [()] ) |
    (?P<square_bracket> [[\]] ) |
    (?P<curly_bracket> [{}] ) |
    (?P<comma> , ) |
    (?P<period> \. ) |
    (?P<terminate> ; ) |
    (?P<colon> : ) |
    (?P<number> \d+ ) |
    (?P<identifier> (?: [^\s\d"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )
                    (?: [^\s"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )* ) |
    (?P<newline> (?: (?: [#] [^\n]* )? \r?\n )+ ) |
    (?P<space> [ \t]+ )
''', re.VERBOSE)


def tokenize(stream):
    r"""Makes tokens from input ``stream``.

    .. sourcecode:: pycon

       >>> t = lambda s: list(tokenize(s))
       >>> t(u'a<-func  (123)')  # doctest: +NORMALIZE_WHITESPACE
       [kong.lexer.Token('identifier', u'a', 0),
        kong.lexer.Token('arrow', u'<-', 1),
        kong.lexer.Token('identifier', u'func', 3),
        kong.lexer.Token('space', u'  ', 7),
        kong.lexer.Token('parenthesis', u'(', 9),
        kong.lexer.Token('number', u'123', 10),
        kong.lexer.Token('parenthesis', u')', 13)]

    It supports streaming as well:

    .. sourcecode:: pycon

       >>> stream = [u'a(12', u'3)\nb<', u'-c * 123']
       >>> t(stream)  # doctest: +NORMALIZE_WHITESPACE
       [kong.lexer.Token('identifier', u'a', 0),
        kong.lexer.Token('parenthesis', u'(', 1),
        kong.lexer.Token('number', u'123', 2),
        kong.lexer.Token('parenthesis', u')', 5),
        kong.lexer.Token('newline', u'\n', 6),
        kong.lexer.Token('identifier', u'b', 7),
        kong.lexer.Token('arrow', u'<-', 8),
        kong.lexer.Token('identifier', u'c', 10),
        kong.lexer.Token('space', u' ', 11),
        kong.lexer.Token('identifier', u'*', 12),
        kong.lexer.Token('space', u' ', 13),
        kong.lexer.Token('number', u'123', 14)]

    :param stream: input stream; a single string is treated as a
                   one-chunk stream
    :type stream: :class:`collections.Iterable`
    :returns: :class:`Token` list
    :rtype: :class:`collections.Iterable`
    :raises SyntaxError: when the leftover input matches no token

    """
    # A bare string is wrapped into a 1-tuple so the chunk loop below
    # handles both cases uniformly.
    if isinstance(stream, basestring):
        stream = stream,
    elif not isinstance(stream, collections.Iterable):
        raise TypeError('stream must be iterable')
    def get_token(m):
        # The pattern is an alternation of named groups, so exactly one
        # entry of groupdict() holds a non-empty string: its name is
        # the token tag.  ``i`` is read from the enclosing scope and is
        # still the offset of the match start at this point (it is only
        # advanced after the token has been built).
        d = m.groupdict()
        for tag, string in d.iteritems():
            if string:
                return Token(str(tag), string, i)
    i = 0   # absolute offset of the start of the unconsumed buffer
    s = ''  # unconsumed tail of the input; may end with a partial token
    for chunk in stream:
        s += chunk
        while True:
            m = TOKEN_PATTERN.match(s)
            # Emit only when the match ends strictly before the end of
            # the buffer: a match that reaches the buffer end might be
            # the prefix of a longer token continued in the next chunk
            # (e.g. number u'12' followed by chunk u'3').
            if m and len(s) > m.end():
                yield get_token(m)
                i += m.end()
                s = s[m.end():]
            else:
                break
    # End of stream: the loop above consumed every token that ended
    # before the buffer end, so any leftover is at most one token.
    if s:
        m = TOKEN_PATTERN.match(s)
        if m:
            yield get_token(m)
        else:
            msg = 'invalid token({0}): {1!r}'.format(i, s[:10])
            raise SyntaxError(i, msg)



class Token(object):
    """A token that contains :attr:`tag`, :attr:`string` and :attr:`offset`.

    .. attribute:: tag

       (:class:`basestring`) The type of token e.g. ``'arrow'``, ``'colon'``.

    .. attribute:: string

       (:class:`basestring`) The token string.

    .. attribute:: offset

       (:class:`numbers.Integral`) The token offset.

    """

    # No per-instance __dict__; tokens are created in bulk by tokenize().
    __slots__ = 'tag', 'string', 'offset'

    def __init__(self, tag, string, offset):
        self.tag = tag
        self.string = string
        self.offset = offset

    def get_syntax_error(self, message=None):
        """Makes a :exc:`SyntaxError` with its :attr:`offset`.

        :param message: an optional error message
        :type message: :class:`basestring`
        :returns: an :exc:`SyntaxError` instance
        :rtype: :exc:`SyntaxError`

        """
        return SyntaxError(self.offset, message)

    def __str__(self):
        # Return the token string as-is when it already is a byte
        # string; otherwise encode through str().
        if isinstance(self.string, str):
            return self.string
        return str(self.string)

    def __unicode__(self):
        # Fixed: the original read the nonexistent ``self.unicode``
        # attribute (a guaranteed AttributeError under __slots__) and
        # tested against str.  Mirror __str__: hand back the string
        # untouched when it already is unicode, convert otherwise.
        if isinstance(self.string, unicode):
            return self.string
        return unicode(self.string)

    def __repr__(self):
        cls = type(self)
        args = (cls.__module__, cls.__name__,
                self.tag, self.string, self.offset)
        return '{0}.{1}({2!r}, {3!r}, {4!r})'.format(*args)


class SyntaxError(ValueError, SyntaxError):
    """An exception that rises when the syntax is invalid."""

    #: (:class:`numbers.Integral`) The errored offset of the :attr:`string`.
    offset = None

    def __init__(self, offset, message=None):
        # Validate the offset up front; everything else is delegated to
        # the builtin exception machinery.
        if isinstance(offset, numbers.Integral):
            super(SyntaxError, self).__init__(message)
            self.offset = offset
        else:
            raise TypeError('offset must be an integer, not ' + repr(offset))

    def get_line(self, string):
        """Gets the errored line number from the code ``string``.

        :param string: code string
        :type string: :class:`basestring`
        :returns: 0-based line number
        :rtype: :class:`numbers.Integral`

        """
        # Every newline before the offset pushes the error one line down.
        return string[:self.offset].count(u'\n')

    def get_column(self, string):
        """Gets the errored column number of the :attr:`line` from the code
        ``string``.

        :param string: code string
        :type string: :class:`basestring`
        :returns: 0-based column number
        :rtype: :class:`numbers.Integral`

        """
        # Distance from the last newline before the offset; rfind()
        # returns -1 when the offset sits on the very first line.
        last_newline = string.rfind(u'\n', 0, self.offset)
        if last_newline < 0:
            return 0
        return self.offset - last_newline - 1