Eric Snow / pylt

Commits

Eric Snow committed f95d98a (draft)

build out the pylt.tokenizing module

  • Parent commit: 5f905d5
  • Branch: default
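For orientation, here is a minimal round-trip through the new module (a sketch only; it assumes the pylt package is importable and uses an arbitrary one-line source):

    from pylt.tokenizing import tokenize, untokenize

    source = b"# some test source\n"

    # raw=True keeps start/end/line on each token; formatter=None tells
    # untokenize() not to strip them back off, so the round trip is exact.
    tokens = tokenize(source, raw=True)
    assert untokenize(tokens, formatter=None) == source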


Files changed (4)

File pylt/parsing/python.py

-"""
+"""Tools and APIs for parsing Python source code.
 
 Based on: http://docs.python.org/dev/reference/grammar.html
 ----
 
 """
 
-import io
+# XXX perhaps introduce transform_tokens(tokens) as an API that modifies
+# type and string of tokens as well as adding, removing, or rearranging them
+
+__all__ = [
+        'ParseError',
+        'ParseUnit', 'SimpleStatement', 'CompoundStatement',
+        'Parser',
+        ]
+
+
 import token
-from tokenize import tokenize as _tokenize, untokenize as _untokenize, NL
 
-
-def tokenize(source):
-    yield from _tokenize(io.BytesIO(source.encode('utf-8')).readline)
-
-
-def untokenize(tokens):
-    return _untokenize(tok[:2] for tok in tokens)
+from ..tokenizing import tokenize, untokenize
 
 
-#def get_statement(tokens):
 class ParseError(Exception):
     """Something went wrong while parsing."""
 
 
-def normalize_formatting(tokens):
-    """Yield the tokens with reasonable formatting.
-
-    The stream of tokens coming out of tokenize.tokenize() has the
-    potential of having poor whitespace placement.  This function
-    helps mitigate that effect.
-
-    """
-    # XXX this could most certainly be generalized.
-    raise NotImplementedError
-
-
 class ParseUnit:
 
     def __init__(self, tokens):
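The XXX note in the hunk above floats transform_tokens(tokens) as a possible API. As a rough, purely hypothetical sketch of what such a hook could look like (the name and behaviour are not part of this commit):

    from tokenize import COMMENT

    from pylt.tokenizing import tokenize, untokenize

    def transform_tokens(tokens):
        # Hypothetical: unlike a formatter, a transform may change the type
        # and string of tokens, or add, drop, and rearrange them outright.
        for tok in tokens:
            if tok.type != COMMENT:     # e.g. drop comments from the stream
                yield tok

    source = untokenize(transform_tokens(tokenize(b"# scratch note\n")))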

File pylt/tests/test_python_parser.py

+import unittest
+import token
+
+from .test_tokenizing import SOURCE, SOURCE_UNNORMALIZED, TOKENS_RAW, TOKENS
+from ..parsing import python as parsing
+
+
+# XXX populate these!!!
+SIMPLE = [
+        ]
+
+COMPOUND = [
+        ]
+
+HEADER = [
+        ]
+
+BODY = [
+        ]
+
+
+class TestsForParseUnits(unittest.TestCase):
+
+    def test_base_get_tokens(self):
+        unit = parsing.ParseUnit(TOKENS)
+        tokens = list(unit.get_tokens())
+
+        self.assertEqual(tokens, TOKENS)
+
+    def test_simple_statement_get_tokens(self):
+        statement = parsing.SimpleStatement(SIMPLE)
+        tokens = list(statement.get_tokens())
+
+        self.assertEqual(tokens, SIMPLE)
+
+    def test_compound_statement_header(self):
+        statement = parsing.CompoundStatement(HEADER, BODY)
+        header = statement.header
+
+        self.assertEqual(header, HEADER)
+
+    def test_compound_statement_body(self):
+        statement = parsing.CompoundStatement(HEADER, BODY)
+        body = statement.body
+
+        self.assertEqual(body, BODY)
+
+    def test_compound_statement_get_tokens(self):
+        statement = parsing.CompoundStatement(HEADER, BODY)
+        tokens = list(statement.get_tokens())
+
+        self.assertEqual(tokens, COMPOUND)
+
+
+class ParserTests(unittest.TestCase):
+
+    def test_parse_source(self):
+        raise NotImplementedError
+
+    def test_parse_suite(self):
+        raise NotImplementedError
+
+    def test_parse_simple_statements(self):
+        raise NotImplementedError

File pylt/tests/test_tokenizing.py

+import unittest
+from token import ENDMARKER, NAME
+from tokenize import TokenInfo, ENCODING, COMMENT, NL
+
+from ..tokenizing import (Token, BareToken, tokenize, untokenize, strip_tokens,
+                          format_source, normalize_source, normalize_tokens)
+
+
+SOURCE = b"""\
+# some test source
+"""
+
+SOURCE_UNNORMALIZED = b"""\
+# some test source
+"""
+
+TOKENS_RAW = [
+        TokenInfo(ENCODING, 'utf-8', (0, 0), (0, 0), ''),
+        TokenInfo(COMMENT, '# some test source',
+                  (1, 0), (1, 18), '# some test source\n'),
+        TokenInfo(NL, '\n',
+                  (1, 18), (1, 19), '# some test source\n'),
+        Token(ENDMARKER, '', (2, 0), (2, 0), ''),
+        ]
+
+TOKENS = [tok._replace(start=(0, 0), end=(0, 0), line='')
+          for tok in TOKENS_RAW]
+
+
+class TokenTests(unittest.TestCase):
+
+    # XXX need test for validating start/end/line
+
+    def test_token(self):
+        tok = Token(NAME, 'spam', (0, 0), (0, 3), 'spam')
+
+        self.assertEqual(tok, (NAME, 'spam', (0, 0), (0, 3), 'spam'))
+
+    def test_token_defaults(self):
+        tok = Token(NAME, 'spam')
+
+        self.assertEqual(tok, (NAME, 'spam', (0, 0), (0, 0), ''))
+
+    def test_token_defaults_explicit(self):
+        tok = Token(NAME, 'spam', None, None, None)
+
+        self.assertEqual(tok, (NAME, 'spam', (0, 0), (0, 0), ''))
+
+    def test_clear(self):
+        original = Token(NAME, 'spam', (0, 0), (0, 3), 'spam')
+        tok = original.clear()
+
+        self.assertEqual(tok, (NAME, 'spam', (0, 0), (0, 0), ''))
+
+    def test_strip(self):
+        original = Token(NAME, 'spam', (0, 0), (0, 3), 'spam')
+        tok = original.strip()
+
+        self.assertEqual(tok, (NAME, 'spam'))
+
+    def test_bare_token(self):
+        tok = BareToken(NAME, 'spam')
+
+        self.assertEqual(tok, (NAME, 'spam'))
+
+    def test_from_token(self):
+        original = Token(NAME, 'spam', (0, 0), (0, 3), 'spam')
+        tok = BareToken.from_token(original)
+
+        self.assertEqual(tok, (NAME, 'spam'))
+
+    def test_to_token(self):
+        original = BareToken(NAME, 'spam')
+        tok = original.to_token()
+
+        self.assertEqual(tok, (NAME, 'spam', (0, 0), (0, 0), ''))
+
+
+class TokenizerTests(unittest.TestCase):
+
+    maxDiff = None
+
+    def test_tokenize(self):
+        tokens = list(tokenize(SOURCE))
+
+        self.assertEqual(tokens, TOKENS)
+
+    def test_tokenize_utf8(self):
+        tokens = list(tokenize(SOURCE.decode('utf-8')))
+
+        self.assertEqual(tokens, TOKENS)
+
+    def test_tokenize_raw(self):
+        tokens = list(tokenize(SOURCE, raw=True))
+
+        self.assertEqual(tokens, TOKENS_RAW)
+
+    def test_untokenize(self):
+        source = untokenize(TOKENS)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_untokenize_raw(self):
+        source = untokenize(TOKENS_RAW)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_untokenize_without_formatting(self):
+        source = untokenize(TOKENS_RAW, formatter=None)
+
+        self.assertEqual(source, SOURCE)
+
+    def test_untokenize_explicitly_stripped(self):
+        formatter = strip_tokens
+        source = untokenize(TOKENS_RAW, formatter=formatter)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_tokenize_roundtrip(self):
+        tokens = tokenize(SOURCE)
+        source = untokenize(tokens)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_tokenize_roundtrip_raw(self):
+        tokens = tokenize(SOURCE, raw=True)
+        source = untokenize(tokens)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_tokenize_roundtrip_without_formatting(self):
+        tokens = tokenize(SOURCE)
+        source = untokenize(tokens, formatter=None)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    def test_tokenize_roundtrip_raw_without_formatting(self):
+        tokens = tokenize(SOURCE, raw=True)
+        source = untokenize(tokens, formatter=None)
+
+        self.assertEqual(source, SOURCE)
+
+
+class FormatterTests(unittest.TestCase):
+
+    def test_strip_tokens(self):
+        tokens = list(strip_tokens(TOKENS_RAW))
+
+        self.assertEqual(tokens, TOKENS)
+
+    def test_format_source(self):
+        formatter = strip_tokens
+        source = format_source(SOURCE, formatter=formatter)
+
+        self.assertEqual(source, SOURCE_UNNORMALIZED)
+
+    @unittest.skip("not finished")
+    def test_normalize_tokens(self):
+        tokens = list(normalize_tokens(TOKENS))
+
+        self.assertEqual(tokens, TOKENS_RAW)
+
+    @unittest.skip("not finished")
+    def test_normalize_source(self):
+        source = normalize_source(SOURCE_UNNORMALIZED)
+
+        self.assertEqual(source, SOURCE)

File pylt/tokenizing.py

+"""Tools and APIs for tokenizing source files.
+
+At the moment this module is very Python-specific.
+
+"""
+
+__all__ = ['Token', 'BareToken', 'strip_tokens', 'tokenize', 'untokenize',
+           'format_source', 'normalize_tokens', 'normalize_source']
+
+
+import io
+from abc import ABCMeta, abstractmethod
+from collections import namedtuple
+from token import tok_name
+from tokenize import (tokenize as _tokenize, untokenize as _untokenize,
+                      TokenInfo, NL)
+
+
+class Token(TokenInfo):
+    """A tokenize.TokenInfo subclass with optional start, end, and line."""
+
+    @classmethod
+    def as_line(cls, type, string, lineno=0, indent=""):
+        """Return the corresponding token and a NL token."""
+        # XXX need to handle alternate line separators
+        # io.IncrementalNewlineDecoder?
+        base = len(indent)
+        linesep = "\n"
+
+        start = (lineno, base)
+        end = (lineno, base + len(string))
+        nl_end = (lineno, base + len(string) + len(linesep))
+        line = indent + string + linesep
+
+        tok = cls(type, string, start, end, line)
+        nl = cls(NL, linesep, end, nl_end, line)
+        return tok, nl
+
+    def __new__(cls, type, string, start=None, end=None, line=None):
+        # XXX None should be a legitimate value for start, end, and line
+        # XXX validate start < end and both within line
+        if start is None:
+            start = (0, 0)
+        if end is None:
+            end = (0, 0)
+        if line is None:
+            line = ""
+        return super().__new__(cls, type, string, start, end, line)
+
+    def clear(self):
+        # this may cause formatting discontinuities with a stream of tokens
+        return self._replace(start=(0, 0), end=(0, 0), line="")
+
+    def strip(self):
+        """Return the corresponding BareToken."""
+        return BareToken.from_token(self)
+
+
+class BareToken(namedtuple('BareToken', "type string"), TokenInfo):
+    """A Token rendered down to a 2-tuple of (type, string)."""
+
+    @classmethod
+    def from_token(cls, tok):
+        return cls(tok.type, tok.string)
+
+    def __repr__(self):
+        tok_type = "{} ({})".format(self.type, tok_name[self.type])
+        return "{}(type={}, string={!r}".format(type(self).__name__,
+                                                tok_type, self.string)
+
+    def __eq__(self, other):
+        return super().__eq__(other[:2])
+
+    def __ne__(self, other):
+        return super().__ne__(other[:2])
+
+    def to_token(self, start=None, end=None, line=None):
+        return Token(self.type, self.string, start, end, line)
+
+
+def strip_tokens(tokens):
+    """Yield the tokens with start, end, and line set to None."""
+    for tok in tokens:
+        # XXX use clear_formatting() instead?
+        yield BareToken.from_token(tok)
+
+
+def tokenize(source, *, raw=False):
+    """Yield the tokens corresponding to the source.
+
+    If raw is True, the start, end, and line information is left intact
+    on the tokens.
+
+    """
+    # XXX should take an iterable of lines (bytes or not)
+    # see http://bugs.python.org/issue12486
+    if not isinstance(source, bytes):
+        source = source.encode('utf-8')
+    tokens = _tokenize(io.BytesIO(source).readline)
+    if not raw:
+        tokens = strip_tokens(tokens)
+    # tokens is an iterator at this point
+    return tokens
+
+
+def untokenize(tokens, *, formatter=strip_tokens):
+    """Return the source generated from the tokens.
+
+    If a formatter is passed, it is called.  A formatter is a function
+    that takes an iterable of tokens and yields them with formatting
+    changed.  The type and string of the tokens should not be changed.
+
+    The default formatter is strip_tokens(), but normalize_tokens() is
+    another useful formatter.
+
+    """
+    # XXX should yield an iterable of lines
+    if formatter is not None:
+        tokens = formatter(tokens)
+    # XXX remove this once issue16224 gets resolved
+    tokens = list(tokens)
+    # XXX tokenize.untokenize() ignores start, end, and line!
+    return _untokenize(tokens)
+
+
+#################################################
+# formatters
+
+class Formatter(metaclass=ABCMeta):
+    """The formatter API.
+
+    A formatter is simply any callable that may be used to change the
+    start, end, and line fields of Token objects.  It should not
+    modify the type or string of any of the tokens.
+
+    Formatters may be passed into untokenize().  A formatter does not
+    need to subclass Formatter.  For instance, any function may be used
+    as a formatter if it matches this API.  Thus both strip_tokens() and
+    normalize_tokens() are formatters.
+
+    """
+    # XXX expand the API to help make formatters?
+
+    @abstractmethod
+    def __call__(self, tokens):
+        """Yield the tokens back with start, end, and line modified.
+
+        The type and string of the tokens should not be modified.
+
+        """
+        raise NotImplementedError
+
+
+def format_source(source, *, formatter):
+    """Return the source with the formatter applied."""
+    tokens = tokenize(source)
+    return untokenize(tokens, formatter=formatter)
+
+
+def normalize_tokens(tokens):
+    """Yield the tokens with reasonable formatting.
+
+    The stream of tokens coming out of tokenize.tokenize() has the
+    potential of having poor whitespace placement.  This function
+    helps mitigate that effect.
+
+    """
+    raise NotImplementedError
+
+
+def normalize_source(source):
+    """Return the source with the formatting normalized."""
+    return format_source(source, formatter=normalize_tokens)
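To make the Formatter contract concrete, here is a minimal sketch of a custom formatter (the PositionDropper name is made up for illustration; functionally it mirrors strip_tokens(), just packaged as a Formatter subclass):

    from pylt.tokenizing import BareToken, Formatter, tokenize, untokenize

    class PositionDropper(Formatter):
        """Hypothetical formatter: discards start/end/line from every token."""

        def __call__(self, tokens):
            # Only start/end/line may change; type and string stay untouched.
            for tok in tokens:
                yield BareToken.from_token(tok)

    source = untokenize(tokenize(b"# some test source\n", raw=True),
                        formatter=PositionDropper())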