""":mod:`kong.lexer` --- Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

"""
import numbers
import collections
import re
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO


#: The :mod:`re` pattern that matches tokens.
TOKEN_PATTERN = re.compile(ur'''
    (?P<string> "(?:[^"]|\\.)*" ) |
    (?P<arrow> <- ) |
    (?P<parenthesis> [()] ) |
    (?P<square_bracket> [[\]] ) |
    (?P<curly_bracket> [{}] ) |
    (?P<comma> , ) |
    (?P<period> \. ) |
    (?P<terminate> ; ) |
    (?P<colon> : ) |
    (?P<number> \d+ ) |
    (?P<identifier> (?: [^\s\d"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )
                    (?: [^\s"(){}[\],.;:#<] | < [^\s"(){}[\],.;:#-] | < $ )* ) |
    (?P<newline> (?: (?: [#] [^\n]* )? \r?\n )+ ) |
    (?P<space> [ \t]+ )
''', re.VERBOSE)
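
# A hedged sketch of how the pattern tags lexemes (assuming a Python 2
# interpreter, since the module uses ``ur''`` literals).  Note that an
# identifier may contain ``<`` as long as it does not begin an arrow:
#
#     >>> m = TOKEN_PATTERN.match(u'<- rest')
#     >>> m.lastgroup, m.group()
#     ('arrow', u'<-')
#     >>> TOKEN_PATTERN.match(u'a<b').group()
#     u'a<b'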


def tokenize(stream):
    r"""Makes tokens from input ``stream``.

    .. sourcecode:: pycon

       >>> t = lambda s: list(tokenize(s))
       >>> t(u'a<-func  (123)')  # doctest: +NORMALIZE_WHITESPACE
       [kong.lexer.Token(u'identifier', u'a', 0),
        kong.lexer.Token(u'arrow', u'<-', 1),
        kong.lexer.Token(u'identifier', u'func', 3),
        kong.lexer.Token(u'space', u'  ', 7),
        kong.lexer.Token(u'parenthesis', u'(', 9),
        kong.lexer.Token(u'number', u'123', 10),
        kong.lexer.Token(u'parenthesis', u')', 13)]

    It supports streaming as well:

    .. sourcecode:: pycon

       >>> stream = [u'a(12', u'3)\nb<', u'-c * 123']
       >>> t(stream)  # doctest: +NORMALIZE_WHITESPACE
       [kong.lexer.Token(u'identifier', u'a', 0),
        kong.lexer.Token(u'parenthesis', u'(', 1),
        kong.lexer.Token(u'number', u'123', 2),
        kong.lexer.Token(u'parenthesis', u')', 5),
        kong.lexer.Token(u'newline', u'\n', 6),
        kong.lexer.Token(u'identifier', u'b', 7),
        kong.lexer.Token(u'arrow', u'<-', 8),
        kong.lexer.Token(u'identifier', u'c', 10),
        kong.lexer.Token(u'space', u' ', 11),
        kong.lexer.Token(u'identifier', u'*', 12),
        kong.lexer.Token(u'space', u' ', 13),
        kong.lexer.Token(u'number', u'123', 14)]

    :param stream: input stream
    :type stream: :class:`collections.Iterable`
    :returns: generated :class:`Token` objects
    :rtype: :class:`collections.Iterable`

    """
    if isinstance(stream, basestring):
        stream = (stream,)
    elif not isinstance(stream, collections.Iterable):
        raise TypeError('stream must be iterable')
    def get_token(m):
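        # Exactly one alternative of TOKEN_PATTERN can match at a time,
        # so the single non-empty group in the group dict gives the tag.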
        d = m.groupdict()
        for tag, string in d.iteritems():
            if string:
                return Token(tag, string, i)
    i = 0
    s = ''
    buf = StringIO.StringIO()
    for chunk in stream:
        s += chunk
        buf.write(chunk)
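        # Consume complete tokens, but hold back a match that reaches
        # the end of the buffer: the next chunk may extend it (this is
        # how u'a(12' + u'3)' becomes one u'123' number token above).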
        while True:
            m = TOKEN_PATTERN.match(s)
            if m and len(s) > m.end():
                yield get_token(m)
                i += m.end()
                s = s[m.end():]
            else:
                break
    if s:
        m = TOKEN_PATTERN.match(s)
        if m:
            yield get_token(m)
        else:
            msg = 'invalid token({0}): {1!r}'.format(i, s[:10])
            raise LexerError(buf.getvalue(), i, msg)
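

# A minimal sketch of handling a failed scan (hypothetical input; the
# lone ``"`` never completes a string literal, so nothing matches at
# offset 5):
#
#     >>> try:
#     ...     list(tokenize(u'a <- "oops'))
#     ... except LexerError as e:
#     ...     print e.offset, e.line, e.column
#     5 0 5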


class Token(object):
    """A token that contains :attr:`tag`, :attr:`string` and :attr:`offset`.

    .. attribute:: tag

       (:class:`basestring`) The type of token e.g. ``'arrow'``, ``'colon'``.

    .. attribute:: string

       (:class:`basestring`) The token string.

    .. attribute:: offset

       (:class:`numbers.Integral`) The token offset.

    """

    __slots__ = 'tag', 'string', 'offset'

    def __init__(self, tag, string, offset):
        self.tag = tag
        self.string = string
        self.offset = offset

    def __str__(self):
        if isinstance(self.string, str):
            return self.string
        return str(self.string)

    def __unicode__(self):
        if isinstance(self.string, unicode):
            return self.string
        return unicode(self.string)

    def __repr__(self):
        cls = type(self)
        args = (cls.__module__, cls.__name__,
                self.tag, self.string, self.offset)
        return '{0}.{1}({2!r}, {3!r}, {4!r})'.format(*args)
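
# A short usage sketch of :class:`Token` (hypothetical values):
#
#     >>> tok = Token(u'number', u'42', 7)
#     >>> print tok
#     42
#     >>> tok.tag, tok.offset
#     (u'number', 7)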


class LexerError(ValueError, SyntaxError):
    """An exception that rises when the invalid token meets."""

    #: (:class:`basestring`) The code string that caused the error.
    string = None

    #: (:class:`numbers.Integral`) The offset in :attr:`string` where
    #: the error occurred.
    offset = None

    def __init__(self, string, offset, message=None):
        if not isinstance(string, basestring):
            raise TypeError('expected string, not ' + repr(string))
        elif not isinstance(offset, numbers.Integral):
            raise TypeError('offset must be an integer, not ' + repr(offset))
        super(LexerError, self).__init__(message)
        self.string = string
        self.offset = offset

    @property
    def line(self):
        """(:class:`numbers.Integral`) The errored line number.
        Starts from 0.

        """
        return self.string.count(u'\n', 0, self.offset)

    @property
    def column(self):
        """(:class:`numbers.Integral`) The errored column number of
        the :attr:`line`. Starts from 0.

        """
        try:
            pos = self.string.rindex(u'\n', 0, self.offset)
        except ValueError:
            # No newline precedes the offset: the error is on the first
            # line, so the column equals the offset itself.
            return self.offset
        return self.offset - pos - 1
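

# A quick sketch of the line/column arithmetic (a hypothetical error at
# offset 12, i.e. on the second line of a two-line source):
#
#     >>> e = LexerError(u'a <- 1\nb <- ?!', 12, 'invalid token')
#     >>> e.line, e.column
#     (1, 5)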