Commits

Kirill Simonov committed a6d94a9

Reviewed the scanner and the parser.

Renamed `htsql.tr.scanner` to `htsql.tr.scan` and `htsql.tr.parser`
to `htsql.tr.parse`. Added global utility functions `scan()` and `parse()`.

Removed unused module `htsql.translate`.

Added a module for translation-specific errors: `htsql.tr.error`.

Comments (0)

Files changed (12)

 
 .. automodule:: htsql.tr
    :members:
+.. automodule:: htsql.tr.error
+   :members:
 .. automodule:: htsql.tr.token
    :members:
-.. automodule:: htsql.tr.scanner
+.. automodule:: htsql.tr.scan
    :members:
 .. automodule:: htsql.tr.syntax
    :members:
-.. automodule:: htsql.tr.parser
+.. automodule:: htsql.tr.parse
    :members:
 
 :mod:`htsql.error`
 ==================
 
-This module implements HTSQL exceptions.
+This module implements generic HTSQL exceptions.
 """
 
 
 from .adapter import Utility, Realization
 from .connect import DBError, Connect, Normalize
 from .error import EngineError
-from .tr.parser import QueryParser
+from .tr.parse import parse
 from .tr.binder import Binder
 from .tr.encoder import Encoder
 from .tr.assembler import Assembler
         self.uri = uri
 
     def translate(self):
-        parser = QueryParser(self.uri)
-        syntax = parser.parse()
+        syntax = parse(self.uri)
         binder = Binder()
         binding = binder.bind_one(syntax)
         encoder = Encoder()

src/htsql/tr/error.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+:mod:`htsql.tr.error`
+=====================
+
+This module implements HTSQL translation errors.
+"""
+
+
+from ..error import BadRequestError
+
+
+class ScanError(BadRequestError):
+    """
+    Represents a scanner error.
+
+    This exception is raised when the scanner cannot tokenize a query.
+    """
+
+    kind = "scan error"
+
+
+class ParseError(BadRequestError):
+    """
+    Represents a parser error.
+
+    This exception is raised by the parser when it encounters an unexpected
+    token.
+    """
+
+    kind = "parse error"
+
+

src/htsql/tr/parse.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+:mod:`htsql.tr.parse`
+=====================
+
+This module implements the HTSQL parser.
+"""
+
+
+from ..mark import Mark
+from .scan import scan
+from .token import NameToken, StringToken, NumberToken, SymbolToken, EndToken
+from .syntax import (QuerySyntax, SegmentSyntax, SelectorSyntax,
+                     SieveSyntax, OperatorSyntax, FunctionOperatorSyntax,
+                     FunctionCallSyntax, GroupSyntax, SpecifierSyntax,
+                     IdentifierSyntax, WildcardSyntax,
+                     StringSyntax, NumberSyntax)
+
+
+class Parser(object):
+    """
+    Implements an HTSQL parser.
+
+    A parser takes a stream of tokens from the HTSQL scanner and
+    produces a syntax node.
+
+    This is an abstract class; see subclasses of :class:`Parser` for
+    implementations of various parts of the HTSQL grammar.
+
+    `input` (a string)
+        The input HTSQL expression.
+    """
+
+    class __metaclass__(type):
+        # Implements a shortcut:
+        #   Parser << tokens
+        # to call
+        #   Parser.process(tokens)
+
+        def __lshift__(parser_class, tokens):
+            return parser_class.process(tokens)
+
+    def __init__(self, input):
+        assert isinstance(input, str)
+        self.input = input
+
+    def parse(self):
+        """
+        Parses the input expression; returns the corresponding syntax node.
+        """
+        # Tokenize the input query.
+        tokens = scan(self.input)
+        # Parse the input query.
+        syntax = self.process(tokens)
+        # Ensure that we reached the end of the token stream.
+        tokens.pop(EndToken)
+        return syntax
+
+    @classmethod
+    def process(cls, tokens):
+        """
+        Takes a stream of tokens; returns a syntax node.
+
+        This function does not have to exhaust the token stream.
+
+        The ``<<`` operator could be used as a synonym for :meth:`process`;
+        the following expressions are equivalent::
+
+            Parser << tokens
+            Parser.process(tokens)
+
+        `tokens` (:class:`htsql.tr.scan.TokenStream`)
+            The stream of tokens to parse.
+        """
+        # Override in subclasses.
+        raise NotImplementedError()
+
+
+class QueryParser(Parser):
+    """
+    Parses an HTSQL query.
+
+    Here is the grammar of HTSQL::
+
+        input           ::= query END
+
+        query           ::= '/' segment?
+        segment         ::= selector | specifier selector? filter?
+        filter          ::= '?' test
+
+        element         ::= test ( '+' | '-' )*
+        test            ::= and_test ( '|' and_test )*
+        and_test        ::= implies_test ( '&' implies_test )*
+        implies_test    ::= unary_test ( '->' unary_test )?
+        unary_test      ::= '!' unary_test | comparison
+
+        comparison      ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
+                                           '$~~' | '$~' | '~~' | '~' |
+                                           '!=~~' | '!=~' | '!^~~' | '!^~' |
+                                           '!$~~' | '!$~' | '!~~' | '!~' |
+                                           '<=' | '<' | '>=' |  '>' |
+                                           '==' | '=' | '!==' | '!=' )
+                                         expression )?
+
+        expression      ::= term ( ( '+' | '-' ) term )*
+        term            ::= factor ( ( '*' | '/' | identifier ) factor )*
+        factor          ::= ( '+' | '-' ) factor | power
+        power           ::= sieve ( '^' power )?
+
+        sieve           ::= specifier selector? filter?
+        specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+        atom            ::= '*' | selector | group | identifier call? | literal
+
+        group           ::= '(' element ')'
+        call            ::= '(' elements? ')'
+        selector        ::= '{' elements? '}'
+        elements        ::= element ( ',' element )* ','?
+
+        identifier      ::= NAME
+        literal         ::= STRING | NUMBER
+
+    Note that this grammar is almost LL(1); one notable exception is
+    the postfix ``+`` and ``-`` operators.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parse the production:
+        #   query           ::= '/' segment?
+        head_token = tokens.pop(SymbolToken, ['/'])
+        segment = None
+        if not tokens.peek(EndToken):
+            segment = SegmentParser << tokens
+        mark = Mark.union(head_token, segment)
+        query = QuerySyntax(segment, mark)
+        return query
+
+
+class SegmentParser(Parser):
+    """
+    Parses a `segment` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   segment         ::= selector | specifier selector? filter?
+        #   filter          ::= '?' test
+        base = None
+        selector = None
+        filter = None
+        if tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+        else:
+            base = SpecifierParser << tokens
+            if tokens.peek(SymbolToken, ['{']):
+                selector = SelectorParser << tokens
+            if tokens.peek(SymbolToken, ['?'], do_pop=True):
+                filter = TestParser << tokens
+        mark = Mark.union(base, selector, filter)
+        segment = SegmentSyntax(base, selector, filter, mark)
+        return segment
+
+
+class ElementParser(Parser):
+    """
+    Parses an `element` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   element         ::= test ( '+' | '-' )*
+        element = TestParser << tokens
+        while tokens.peek(SymbolToken, ['+', '-']):
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol = symbol_token.value
+            mark = Mark.union(element, symbol_token)
+            element = OperatorSyntax(symbol, element, None, mark)
+        return element
+
+
+class TestParser(Parser):
+    """
+    Parses a `test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   test            ::= and_test ( '|' and_test )*
+        test = AndTestParser << tokens
+        while tokens.peek(SymbolToken, ['|']):
+            symbol_token = tokens.pop(SymbolToken, ['|'])
+            symbol = symbol_token.value
+            left = test
+            right = AndTestParser << tokens
+            mark = Mark.union(left, right)
+            test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class AndTestParser(Parser):
+    """
+    Parses an `and_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   and_test        ::= implies_test ( '&' implies_test )*
+        test = ImpliesTestParser << tokens
+        while tokens.peek(SymbolToken, ['&']):
+            symbol_token = tokens.pop(SymbolToken, ['&'])
+            symbol = symbol_token.value
+            left = test
+            right = ImpliesTestParser << tokens
+            mark = Mark.union(left, right)
+            test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class ImpliesTestParser(Parser):
+    """
+    Parses an `implies_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   implies_test    ::= unary_test ( '->' unary_test )?
+        test = UnaryTestParser << tokens
+        if not tokens.peek(SymbolToken, ['->']):
+            return test
+        symbol_token = tokens.pop(SymbolToken, ['->'])
+        symbol = symbol_token.value
+        left = test
+        right = UnaryTestParser << tokens
+        mark = Mark.union(left, right)
+        test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class UnaryTestParser(Parser):
+    """
+    Parses a `unary_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   unary_test      ::= '!' unary_test | comparison
+        symbol_tokens = []
+        while tokens.peek(SymbolToken, ['!']):
+            symbol_token = tokens.pop(SymbolToken, ['!'])
+            symbol_tokens.append(symbol_token)
+        test = ComparisonParser << tokens
+        while symbol_tokens:
+            symbol_token = symbol_tokens.pop()
+            symbol = symbol_token.value
+            mark = Mark.union(symbol_token, test)
+            test = OperatorSyntax(symbol, None, test, mark)
+        return test
+
+
+class ComparisonParser(Parser):
+    """
+    Parses a `comparison` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   comparison  ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
+        #                                  '$~~' | '$~' | '~~' | '~' |
+        #                                  '!=~~' | '!=~' | '!^~~' | '!^~' |
+        #                                  '!$~~' | '!$~' | '!~~' | '!~' |
+        #                                  '<=' | '<' | '>=' |  '>' |
+        #                                  '==' | '=' | '!==' | '!=' )
+        #                                expression )?
+        expression = ExpressionParser << tokens
+        symbol_token = tokens.peek(SymbolToken,
+                                   ['=~~', '=~', '^~~', '^~',
+                                    '$~~', '$~', '~~', '~',
+                                    '!=~~', '!=~', '!^~~', '!^~',
+                                    '!$~~', '!$~', '!~~', '!~',
+                                    '<=', '<', '>=', '>',
+                                    '==', '=', '!==', '!='], do_pop=True)
+        if symbol_token is None:
+            return expression
+        symbol = symbol_token.value
+        left = expression
+        right = ExpressionParser << tokens
+        mark = Mark.union(left, right)
+        comparison = OperatorSyntax(symbol, left, right, mark)
+        return comparison
+
+
+class ExpressionParser(Parser):
+    """
+    Parses an `expression` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   expression      ::= term ( ( '+' | '-' ) term )*
+        expression = TermParser << tokens
+        # Here we perform a look-ahead to distinguish between productions:
+        #   element         ::= test ( '+' | '-' )*
+        # and
+        #   expression      ::= term ( ( '+' | '-' ) term )*
+        # We know that the FOLLOWS set of `element` consists of the symbols:
+        #   ',', ')', and '}',
+        # which never start the `term` non-terminal.
+        while tokens.peek(SymbolToken, ['+', '-']):
+            ahead = 1
+            while tokens.peek(SymbolToken, ['+', '-'], ahead=ahead):
+                ahead += 1
+            if tokens.peek(SymbolToken, [',', ')', '}'], ahead=ahead):
+                break
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol = symbol_token.value
+            left = expression
+            right = TermParser << tokens
+            mark = Mark.union(left, right)
+            expression = OperatorSyntax(symbol, left, right, mark)
+        return expression
+
+
+class TermParser(Parser):
+    """
+    Parses a `term` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   term            ::= factor ( ( '*' | '/' | identifier ) factor )*
+        expression = FactorParser << tokens
+        while (tokens.peek(SymbolToken, ['*', '/'])
+               or tokens.peek(NameToken)):
+            if tokens.peek(SymbolToken, ['*', '/']):
+                symbol_token = tokens.pop(SymbolToken, ['*', '/'])
+                symbol = symbol_token.value
+                left = expression
+                right = FactorParser << tokens
+                mark = Mark.union(left, right)
+                expression = OperatorSyntax(symbol, left, right, mark)
+            else:
+                identifier = IdentifierParser << tokens
+                left = expression
+                right = FactorParser << tokens
+                mark = Mark.union(left, right)
+                expression = FunctionOperatorSyntax(identifier,
+                                                    left, right, mark)
+        return expression
+
+
+class FactorParser(Parser):
+    """
+    Parses a `factor` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   factor          ::= ( '+' | '-' ) factor | power
+        symbol_tokens = []
+        while tokens.peek(SymbolToken, ['+', '-']):
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol_tokens.append(symbol_token)
+        expression = PowerParser << tokens
+        while symbol_tokens:
+            symbol_token = symbol_tokens.pop()
+            symbol = symbol_token.value
+            mark = Mark.union(symbol_token, expression)
+            expression = OperatorSyntax(symbol, None, expression, mark)
+        return expression
+
+
+class PowerParser(Parser):
+    """
+    Parses a `power` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   power           ::= sieve ( '^' power )?
+        expression = SieveParser << tokens
+        # Note that there is some grammar ambiguity here: if the sieve
+        # contains a filter part, the sieve parser will greedily consume
+        # any `^` expression.
+        if not tokens.peek(SymbolToken, ['^']):
+            return expression
+        symbol_token = tokens.pop(SymbolToken, ['^'])
+        symbol = symbol_token.value
+        left = expression
+        right = PowerParser << tokens
+        mark = Mark.union(left, right)
+        expression = OperatorSyntax(symbol, left, right, mark)
+        return expression
+
+
+class SieveParser(Parser):
+    """
+    Parses a `sieve` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   sieve           ::= specifier selector? filter?
+        expression = SpecifierParser << tokens
+        selector = None
+        filter = None
+        if tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+        if tokens.peek(SymbolToken, ['?']):
+            tokens.pop(SymbolToken, ['?'])
+            filter = TestParser << tokens
+        if selector is None and filter is None:
+            return expression
+        mark = Mark.union(expression, selector, filter)
+        expression = SieveSyntax(expression, selector, filter, mark)
+        return expression
+
+
+class SpecifierParser(Parser):
+    """
+    Parses a `specifier` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+        #   call            ::= '(' elements? ')'
+        #   elements        ::= element ( ',' element )* ','?
+        expression = AtomParser << tokens
+        while tokens.peek(SymbolToken, ['.']):
+            head_token = tokens.pop(SymbolToken, ['.'])
+            if tokens.peek(SymbolToken, ['*']):
+                symbol_token = tokens.pop(SymbolToken, ['*'])
+                wildcard = WildcardSyntax(symbol_token.mark)
+                mark = Mark.union(head_token, wildcard)
+                expression = SpecifierSyntax(expression, wildcard, mark)
+                break
+            else:
+                identifier = IdentifierParser << tokens
+                if tokens.peek(SymbolToken, ['(']):
+                    tokens.pop(SymbolToken, ['('])
+                    arguments = []
+                    while not tokens.peek(SymbolToken, [')']):
+                        argument = ElementParser << tokens
+                        arguments.append(argument)
+                        if not tokens.peek(SymbolToken, [')']):
+                            tokens.pop(SymbolToken, [',', ')'])
+                    tail_token = tokens.pop(SymbolToken, [')'])
+                    mark = Mark.union(head_token, tail_token)
+                    expression = FunctionCallSyntax(expression, identifier,
+                                                    arguments, mark)
+                else:
+                    mark = Mark.union(head_token, identifier)
+                    expression = SpecifierSyntax(expression, identifier, mark)
+        return expression
+
+
+class AtomParser(Parser):
+    """
+    Parses an `atom` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   atom        ::= '*' | selector | group | identifier call? | literal
+        #   call        ::= '(' elements? ')'
+        #   elements    ::= element ( ',' element )* ','?
+        #   literal     ::= STRING | NUMBER
+        if tokens.peek(SymbolToken, ['*']):
+            symbol_token = tokens.pop(SymbolToken, ['*'])
+            wildcard = WildcardSyntax(symbol_token.mark)
+            return wildcard
+        elif tokens.peek(SymbolToken, ['(']):
+            group = GroupParser << tokens
+            return group
+        elif tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+            return selector
+        elif tokens.peek(NameToken):
+            identifier = IdentifierParser << tokens
+            if tokens.peek(SymbolToken, ['(']):
+                tokens.pop(SymbolToken, ['('])
+                arguments = []
+                while not tokens.peek(SymbolToken, [')']):
+                    argument = ElementParser << tokens
+                    arguments.append(argument)
+                    if not tokens.peek(SymbolToken, [')']):
+                        tokens.pop(SymbolToken, [',', ')'])
+                tail_token = tokens.pop(SymbolToken, [')'])
+                mark = Mark.union(identifier, tail_token)
+                expression = FunctionCallSyntax(None, identifier,
+                                                arguments, mark)
+                return expression
+            else:
+                return identifier
+        elif tokens.peek(StringToken):
+            token = tokens.pop(StringToken)
+            return StringSyntax(token.value, token.mark)
+        elif tokens.peek(NumberToken):
+            token = tokens.pop(NumberToken)
+            return NumberSyntax(token.value, token.mark)
+        # We expect it to always produce an error message.
+        tokens.pop(NameToken)
+        # Not reachable.
+        assert False
+
+
+class GroupParser(Parser):
+    """
+    Parses a `group` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   group           ::= '(' element ')'
+        head_token = tokens.pop(SymbolToken, ['('])
+        expression = ElementParser << tokens
+        tail_token = tokens.pop(SymbolToken, [')'])
+        mark = Mark.union(head_token, tail_token)
+        group = GroupSyntax(expression, mark)
+        return group
+
+
+class SelectorParser(Parser):
+    """
+    Parses a `selector` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   selector        ::= '{' elements? '}'
+        #   elements        ::= element ( ',' element )* ','?
+        head_token = tokens.pop(SymbolToken, ['{'])
+        elements = []
+        while not tokens.peek(SymbolToken, ['}']):
+            element = ElementParser << tokens
+            elements.append(element)
+            if not tokens.peek(SymbolToken, ['}']):
+                # We know it's not going to be '}', but we put it into the list
+                # of accepted values to generate a better error message.
+                tokens.pop(SymbolToken, [',', '}'])
+        tail_token = tokens.pop(SymbolToken, ['}'])
+        mark = Mark.union(head_token, tail_token)
+        selector = SelectorSyntax(elements, mark)
+        return selector
+
+
+class IdentifierParser(Parser):
+    """
+    Parses an `identifier` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   identifier      ::= NAME
+        name_token = tokens.pop(NameToken)
+        identifier = IdentifierSyntax(name_token.value, name_token.mark)
+        return identifier
+
+
+def parse(input, Parser=QueryParser):
+    """
+    Parses the input HTSQL query; returns the corresponding syntax node.
+
+    In case of syntax errors, may raise :exc:`htsql.tr.error.ScanError`
+    or :exc:`htsql.tr.error.ParseError`.
+
+    `input` (a string)
+        An HTSQL query or an HTSQL expression.
+
+    `Parser` (a subclass of :class:`Parser`)
+        The parser to use for parsing the input expression.  By default,
+        `input` is treated as a complete HTSQL query.
+    """
+    parser = Parser(input)
+    return parser.parse()
+
+

src/htsql/tr/parser.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-:mod:`htsql.tr.parser`
-======================
-
-This module implements the HTSQL parser.
-"""
-
-
-from ..mark import Mark
-from .scanner import Scanner
-from .token import NameToken, StringToken, NumberToken, SymbolToken, EndToken
-from .syntax import (QuerySyntax, SegmentSyntax, SelectorSyntax,
-                     SieveSyntax, OperatorSyntax, FunctionOperatorSyntax,
-                     FunctionCallSyntax, GroupSyntax, SpecifierSyntax,
-                     IdentifierSyntax, WildcardSyntax,
-                     StringSyntax, NumberSyntax)
-
-
-class Parser(object):
-    """
-    Implements an HTSQL parser.
-
-    A parser takes a stream of tokens from the HTSQL scanner and
-    produces a syntax node.
-
-    This is an abstract class; see subclasses of :class:`Parser` for
-    implementations of various parts of the HTSQL grammar.
-
-    `input` (a string)
-        The input HTSQL expression.
-    """
-
-    class __metaclass__(type):
-        # Implements a shortcut:
-        #   Parser << tokens
-        # to call
-        #   Parser.process(tokens)
-
-        def __lshift__(parser_class, tokens):
-            return parser_class.process(tokens)
-
-    def __init__(self, input):
-        assert isinstance(input, str)
-        self.input = input
-
-    def parse(self):
-        """
-        Parses the input expression; returns the corresponding syntax node.
-        """
-        # Tokenize the input query.
-        scanner = Scanner(self.input)
-        tokens = scanner.scan()
-        # Parse the input query.
-        syntax = self.process(tokens)
-        # Ensure that we reached the end of the token stream.
-        tokens.pop(EndToken)
-        return syntax
-
-    @classmethod
-    def process(cls, tokens):
-        """
-        Takes a stream of tokens; returns a syntax node.
-
-        This function does not have to exhaust the token stream.
-
-        The ``<<`` operator could be used as a synonym for :meth:`process`;
-        the following expressions are equivalent::
-
-            Parser << tokens
-            Parser.process(tokens)
-
-        `tokens` (:class:`htsql.tr.scanner.TokenStream`)
-            The stream of tokens to parse.
-        """
-        # Override in subclasses.
-        raise NotImplementedError()
-
-
-class QueryParser(Parser):
-    """
-    Parses an HTSQL query.
-
-    Here is the grammar of HTSQL::
-
-        input           ::= query END
-
-        query           ::= '/' segment?
-        segment         ::= selector | specifier selector? filter?
-        filter          ::= '?' test
-
-        element         ::= test ( '+' | '-' )*
-        test            ::= and_test ( '|' and_test )*
-        and_test        ::= implies_test ( '&' implies_test )*
-        implies_test    ::= unary_test ( '->' unary_test )?
-        unary_test      ::= '!' unary_test | comparison
-
-        comparison      ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
-                                           '$~~' | '$~' | '~~' | '~' |
-                                           '!=~~' | '!=~' | '!^~~' | '!^~' |
-                                           '!$~~' | '!$~' | '!~~' | '!~' |
-                                           '<=' | '<' | '>=' |  '>' |
-                                           '==' | '=' | '!==' | '!=' )
-                                         expression )?
-
-        expression      ::= term ( ( '+' | '-' ) term )*
-        term            ::= factor ( ( '*' | '/' | identifier ) factor )*
-        factor          ::= ( '+' | '-' ) factor | power
-        power           ::= sieve ( '^' power )?
-
-        sieve           ::= specifier selector? filter?
-        specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
-        atom            ::= '*' | selector | group | identifier call? | literal
-
-        group           ::= '(' element ')'
-        call            ::= '(' elements? ')'
-        selector        ::= '{' elements? '}'
-        elements        ::= element ( ',' element )* ','?
-
-        identifier      ::= NAME
-        literal         ::= STRING | NUMBER
-
-    Note that this grammar is almost LL(1); one notable exception is
-    the postfix ``+`` and ``-`` operators.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parse the production:
-        #   query           ::= '/' segment?
-        head_token = tokens.pop(SymbolToken, ['/'])
-        segment = None
-        if not tokens.peek(EndToken):
-            segment = SegmentParser << tokens
-        mark = Mark.union(head_token, segment)
-        query = QuerySyntax(segment, mark)
-        return query
-
-
-class SegmentParser(Parser):
-    """
-    Parses a `segment` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the productions:
-        #   segment         ::= selector | specifier selector? filter?
-        #   filter          ::= '?' test
-        base = None
-        selector = None
-        filter = None
-        if tokens.peek(SymbolToken, ['{']):
-            selector = SelectorParser << tokens
-        else:
-            base = SpecifierParser << tokens
-            if tokens.peek(SymbolToken, ['{']):
-                selector = SelectorParser << tokens
-            if tokens.peek(SymbolToken, ['?'], do_pop=True):
-                filter = TestParser << tokens
-        mark = Mark.union(base, selector, filter)
-        segment = SegmentSyntax(base, selector, filter, mark)
-        return segment
-
-
-class ElementParser(Parser):
-    """
-    Parses an `element` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   element         ::= test ( '+' | '-' )*
-        element = TestParser << tokens
-        while tokens.peek(SymbolToken, ['+', '-']):
-            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
-            symbol = symbol_token.value
-            mark = Mark.union(element, symbol_token)
-            element = OperatorSyntax(symbol, element, None, mark)
-        return element
-
-
-class TestParser(Parser):
-    """
-    Parses a `test` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   test            ::= and_test ( '|' and_test )*
-        test = AndTestParser << tokens
-        while tokens.peek(SymbolToken, ['|']):
-            symbol_token = tokens.pop(SymbolToken, ['|'])
-            symbol = symbol_token.value
-            left = test
-            right = AndTestParser << tokens
-            mark = Mark.union(left, right)
-            test = OperatorSyntax(symbol, left, right, mark)
-        return test
-
-
-class AndTestParser(Parser):
-    """
-    Parses an `and_test` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   and_test        ::= implies_test ( '&' implies_test )*
-        test = ImpliesTestParser << tokens
-        while tokens.peek(SymbolToken, ['&']):
-            symbol_token = tokens.pop(SymbolToken, ['&'])
-            symbol = symbol_token.value
-            left = test
-            right = ImpliesTestParser << tokens
-            mark = Mark.union(left, right)
-            test = OperatorSyntax(symbol, left, right, mark)
-        return test
-
-
-class ImpliesTestParser(Parser):
-    """
-    Parses an `implies_test` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   implies_test    ::= unary_test ( '->' unary_test )?
-        test = UnaryTestParser << tokens
-        if not tokens.peek(SymbolToken, ['->']):
-            return test
-        symbol_token = tokens.pop(SymbolToken, ['->'])
-        symbol = symbol_token.value
-        left = test
-        right = UnaryTestParser << tokens
-        mark = Mark.union(left, right)
-        test = OperatorSyntax(symbol, left, right, mark)
-        return test
-
-
-class UnaryTestParser(Parser):
-    """
-    Parses a `unary_test` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   unary_test      ::= '!' unary_test | comparison
-        symbol_tokens = []
-        while tokens.peek(SymbolToken, ['!']):
-            symbol_token = tokens.pop(SymbolToken, ['!'])
-            symbol_tokens.append(symbol_token)
-        test = ComparisonParser << tokens
-        while symbol_tokens:
-            symbol_token = symbol_tokens.pop()
-            symbol = symbol_token.value
-            mark = Mark.union(symbol_token, test)
-            test = OperatorSyntax(symbol, None, test, mark)
-        return test
-
-
-class ComparisonParser(Parser):
-    """
-    Parses a `comparison` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   comparison  ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
-        #                                  '$~~' | '$~' | '~~' | '~' |
-        #                                  '!=~~' | '!=~' | '!^~~' | '!^~' |
-        #                                  '!$~~' | '!$~' | '!~~' | '!~' |
-        #                                  '<=' | '<' | '>=' |  '>' |
-        #                                  '==' | '=' | '!==' | '!=' )
-        #                                expression )?
-        expression = ExpressionParser << tokens
-        symbol_token = tokens.peek(SymbolToken,
-                                   ['=~~', '=~', '^~~', '^~',
-                                    '$~~', '$~', '~~', '~',
-                                    '!=~~', '!=~', '!^~~', '!^~',
-                                    '!$~~', '!$~', '!~~', '!~',
-                                    '<=', '<', '>=', '>',
-                                    '==', '=', '!==', '!='], do_pop=True)
-        if symbol_token is None:
-            return expression
-        symbol = symbol_token.value
-        left = expression
-        right = ExpressionParser << tokens
-        mark = Mark.union(left, right)
-        comparison = OperatorSyntax(symbol, left, right, mark)
-        return comparison
-
-
-class ExpressionParser(Parser):
-    """
-    Parses an `expression` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   expression      ::= term ( ( '+' | '-' ) term )*
-        expression = TermParser << tokens
-        # Here we perform a look-ahead to distinguish between productions:
-        #   element         ::= test ( '+' | '-' )*
-        # and
-        #   expression      ::= term ( ( '+' | '-' ) term )*
-        # We know that the FOLLOWS set of `element` consists of the symbols:
-        #   ',', ')', and '}',
-        # which never start the `term` non-terminal.
-        while tokens.peek(SymbolToken, ['+', '-']):
-            ahead = 1
-            while tokens.peek(SymbolToken, ['+', '-'], ahead=ahead):
-                ahead += 1
-            if tokens.peek(SymbolToken, [',', ')', '}'], ahead=ahead):
-                break
-            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
-            symbol = symbol_token.value
-            left = expression
-            right = TermParser << tokens
-            mark = Mark.union(left, right)
-            expression = OperatorSyntax(symbol, left, right, mark)
-        return expression
-
-
-class TermParser(Parser):
-    """
-    Parses a `term` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   term            ::= factor ( ( '*' | '/' | identifier ) factor )*
-        expression = FactorParser << tokens
-        while (tokens.peek(SymbolToken, ['*', '/'])
-               or tokens.peek(NameToken)):
-            if tokens.peek(SymbolToken, ['*', '/']):
-                symbol_token = tokens.pop(SymbolToken, ['*', '/'])
-                symbol = symbol_token.value
-                left = expression
-                right = FactorParser << tokens
-                mark = Mark.union(left, right)
-                expression = OperatorSyntax(symbol, left, right, mark)
-            else:
-                identifier = IdentifierParser << tokens
-                left = expression
-                right = FactorParser << tokens
-                mark = Mark.union(left, right)
-                expression = FunctionOperatorSyntax(identifier,
-                                                    left, right, mark)
-        return expression
-
-
-class FactorParser(Parser):
-    """
-    Parses a `factor` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   factor          ::= ( '+' | '-' ) factor | power
-        symbol_tokens = []
-        while tokens.peek(SymbolToken, ['+', '-']):
-            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
-            symbol_tokens.append(symbol_token)
-        expression = PowerParser << tokens
-        while symbol_tokens:
-            symbol_token = symbol_tokens.pop()
-            symbol = symbol_token.value
-            mark = Mark.union(symbol_token, expression)
-            expression = OperatorSyntax(symbol, None, expression, mark)
-        return expression
-
-
-class PowerParser(Parser):
-    """
-    Parses a `power` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   power           ::= sieve ( '^' power )?
-        expression = SieveParser << tokens
-        # Note that there is some grammar ambiguity here: if the sieve
-        # contains a filter part, the sieve parser will greedily consume
-        # any `^` expression.
-        if not tokens.peek(SymbolToken, ['^']):
-            return expression
-        symbol_token = tokens.pop(SymbolToken, ['^'])
-        symbol = symbol_token.value
-        left = expression
-        right = PowerParser << tokens
-        mark = Mark.union(left, right)
-        expression = OperatorSyntax(symbol, None, expression, mark)
-        return expression
-
-
-class SieveParser(Parser):
-    """
-    Parses a `sieve` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   sieve           ::= specifier selector? filter?
-        expression = SpecifierParser << tokens
-        selector = None
-        filter = None
-        if tokens.peek(SymbolToken, ['{']):
-            selector = SelectorParser << tokens
-        if tokens.peek(SymbolToken, ['?']):
-            tokens.pop(SymbolToken, ['?'])
-            filter = TestParser << tokens
-        if selector is None and filter is None:
-            return expression
-        mark = Mark.union(expression, selector, filter)
-        expression = SieveSyntax(expression, selector, filter, mark)
-        return expression
-
-
-class SpecifierParser(Parser):
-    """
-    Parses a `specifier` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the productions:
-        #   specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
-        #   call            ::= '(' elements? ')'
-        #   elements        ::= element ( ',' element )* ','?
-        expression = AtomParser << tokens
-        while tokens.peek(SymbolToken, ['.']):
-            head_token = tokens.pop(SymbolToken, ['.'])
-            if tokens.peek(SymbolToken, ['*']):
-                symbol_token = tokens.pop(SymbolToken, ['*'])
-                wildcard = WildcardSyntax(symbol_token.mark)
-                mark = Mark.union(head_token, wildcard)
-                expression = SpecifierSyntax(expression, wildcard, mark)
-                break
-            else:
-                identifier = IdentifierParser << tokens
-                if tokens.peek(SymbolToken, ['(']):
-                    tokens.pop(SymbolToken, ['('])
-                    arguments = []
-                    while not tokens.peek(SymbolToken, [')']):
-                        argument = ElementParser << tokens
-                        arguments.append(argument)
-                        if not tokens.peek(SymbolToken, [')']):
-                            tokens.pop(SymbolToken, [',', ')'])
-                    tail_token = tokens.pop(SymbolToken, [')'])
-                    mark = Mark.union(head_token, tail_token)
-                    expression = FunctionCallSyntax(expression, identifier,
-                                                    arguments, mark)
-                else:
-                    mark = Mark.union(head_token, identifier)
-                    expression = SpecifierSyntax(expression, identifier, mark)
-        return expression
-
-
-class AtomParser(Parser):
-    """
-    Parses an `atom` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the productions:
-        #   atom        ::= '*' | selector | group | identifier call? | literal
-        #   call        ::= '(' elements? ')'
-        #   elements    ::= element ( ',' element )* ','?
-        #   literal     ::= STRING | NUMBER
-        if tokens.peek(SymbolToken, ['*']):
-            symbol_token = tokens.pop(SymbolToken, ['*'])
-            wildcard = WildcardSyntax(symbol_token.mark)
-            return wildcard
-        elif tokens.peek(SymbolToken, ['(']):
-            group = GroupParser << tokens
-            return group
-        elif tokens.peek(SymbolToken, ['{']):
-            selector = SelectorParser << tokens
-            return selector
-        elif tokens.peek(NameToken):
-            identifier = IdentifierParser << tokens
-            if tokens.peek(SymbolToken, ['(']):
-                tokens.pop(SymbolToken, ['('])
-                arguments = []
-                while not tokens.peek(SymbolToken, [')']):
-                    argument = ElementParser << tokens
-                    arguments.append(argument)
-                    if not tokens.peek(SymbolToken, [')']):
-                        tokens.pop(SymbolToken, [',', ')'])
-                tail_token = tokens.pop(SymbolToken, [')'])
-                mark = Mark.union(identifier, tail_token)
-                expression = FunctionCallSyntax(None, identifier,
-                                                arguments, mark)
-                return expression
-            else:
-                return identifier
-        elif tokens.peek(StringToken):
-            token = tokens.pop(StringToken)
-            return StringSyntax(token.value, token.mark)
-        elif tokens.peek(NumberToken):
-            token = tokens.pop(NumberToken)
-            return NumberSyntax(token.value, token.mark)
-        # We expect it to always produce an error message.
-        tokens.pop(NameToken)
-        # Not reachable.
-        assert False
-
-
-class GroupParser(Parser):
-    """
-    Parses a `group` production.
-    """
-
-    @classmethod
-    def process(self, tokens):
-        # Parses the production:
-        #   group           ::= '(' element ')'
-        head_token = tokens.pop(SymbolToken, ['('])
-        expression = ElementParser << tokens
-        tail_token = tokens.pop(SymbolToken, [')'])
-        mark = Mark.union(head_token, tail_token)
-        group = GroupSyntax(expression, mark)
-        return group
-
-
-class SelectorParser(Parser):
-    """
-    Parses a `selector` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the productions:
-        #   selector        ::= '{' elements? '}'
-        #   elements        ::= element ( ',' element )* ','?
-        head_token = tokens.pop(SymbolToken, ['{'])
-        elements = []
-        while not tokens.peek(SymbolToken, ['}']):
-            element = ElementParser << tokens
-            elements.append(element)
-            if not tokens.peek(SymbolToken, ['}']):
-                # We know it's not going to be '}', but we put it into the list
-                # of accepted values to generate a better error message.
-                tokens.pop(SymbolToken, [',', '}'])
-        tail_token = tokens.pop(SymbolToken, ['}'])
-        mark = Mark.union(head_token, tail_token)
-        selector = SelectorSyntax(elements, mark)
-        return selector
-
-
-class IdentifierParser(Parser):
-    """
-    Parser an `identifier` production.
-    """
-
-    @classmethod
-    def process(cls, tokens):
-        # Parses the production:
-        #   identifier      ::= NAME
-        name_token = tokens.pop(NameToken)
-        identifier = IdentifierSyntax(name_token.value, name_token.mark)
-        return identifier
-
-
+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+:mod:`htsql.tr.scanner`
+=======================
+
+This module implements the HTSQL scanner.
+"""
+
+
+from .token import (Token, SpaceToken, NameToken, StringToken, NumberToken,
+                    SymbolToken, EndToken)
+from .error import ScanError, ParseError
+from ..mark import Mark
+from ..util import maybe, listof
+import re
+
+
+class TokenStream(object):
+    """
+    A sequence of tokens.
+
+    :class:`TokenStream` wraps a list of tokens with a convenient interface
+    for consumption and look-ahead.
+
+    `tokens` (a list of :class:`htsql.tr.token.Token` objects)
+        A list of tokens.
+    """
+
+    def __init__(self, tokens):
+        assert isinstance(tokens, listof(Token))
+
+        # The list of tokens.
+        self.tokens = tokens
+        # The position of the active token.
+        self.idx = 0
+
+    def peek(self, token_class=None, values=None, ahead=0,
+             do_pop=False, do_force=False):
+        """
+        Returns the active token.
+
+        If the parameter `ahead` is non-zero, the method will return
+        the next `ahead` token after the active one.
+
+        When `token_class` is set, the method checks if the token is an
+        instance of the given class.  When `values` is set, the method
+        checks if the token value belongs to the given list of values.
+        If any of the checks fail, the method either raises
+        :exc:`htsql.tr.error.ParseError` or returns ``None`` depending
+        on the value of the `do_force` parameter.
+
+        This method advances the active pointer to the next token if
+        `do_pop` is enabled.
+
+        `token_class` (a subclass of :class:`htsql.tr.token.Token` or ``None``)
+            If not ``None``, the method checks that the returned token
+            is an instance of `token_class`.
+
+        `values` (a list of strings or ``None``)
+            If not ``None``, the method checks that the value of the
+            returned token belongs to the list.
+
+        `ahead` (an integer)
+            The position of the returned token relative to the active one.
+
+        `do_pop` (Boolean)
+            If set, the method will advance the active pointer.
+
+        `do_force` (Boolean)
+            This flag affects the method behavior when any of the token
+            checks fail.  If set, the method will raise
+            :exc:`htsql.tr.error.ParseError`; otherwise it will return
+            ``None``.
+        """
+        # Sanity check on the arguments.
+        assert token_class is None or issubclass(token_class, Token)
+        assert isinstance(values, maybe(listof(str)))
+        assert token_class is not None or values is None
+        assert isinstance(ahead, int) and ahead >= 0
+        assert self.idx+ahead < len(self.tokens)
+        assert isinstance(do_pop, bool)
+        assert isinstance(do_force, bool)
+
+        # Get the token we are going to return.  It is the responsibility
+        # of the caller to ensure that the index is in the list bounds.
+        token = self.tokens[self.idx+ahead]
+
+        # Indicates if the token passed the given tests.
+        is_expected = True
+        if token_class is not None:
+            # Check the token type.
+            if not isinstance(token, token_class):
+                is_expected = False
+            else:
+                if values is not None:
+                    # Check the token value.
+                    if token.value not in values:
+                        is_expected = False
+
+        # The token failed the checks.
+        if not is_expected:
+            # If `do_force` is not set, return `None`; otherwise generate
+            # an error message of the form:
+            #   expected {token_class.NAME} ('{values[0]}', ...);
+            #   got {token.NAME} '{token.value}'
+            if not do_force:
+                return None
+            expected = "%s" % token_class.name.upper()
+            if values:
+                if len(values) == 1:
+                    expected = "%s %r" % (expected, values[0])
+                else:
+                    expected = "%s (%s)" % (expected,
+                                            ", ".join(repr(value)
+                                                      for value in values))
+            got = "%s %r" % (token.name.upper(), token.value)
+            raise ParseError("expected %s; got %s" % (expected, got),
+                             token.mark)
+        # Advance the pointer.
+        if do_pop:
+            self.idx += ahead+1
+
+        return token
+
+    def pop(self, token_class=None, values=None):
+        """
+        Returns the active token and advances the pointer to the next token.
+
+        When `token_class` is set, the method checks if the token is an
+        instance of the given class.  When `values` is set, the method
+        checks if the token value belongs to the given list of values.
+        If any of the checks fail, the method raises
+        :exc:`htsql.tr.error.ParseError`.
+
+        `token_class` (a subclass of :class:`htsql.tr.token.Token` or ``None``)
+            If not ``None``, the method checks that the active token
+            is an instance of `token_class`.
+
+        `values` (a list of strings or ``None``)
+            If not ``None``, the method checks that the value of the active
+            token belongs to the list.
+        """
+        return self.peek(token_class, values,
+                         do_pop=True, do_force=True)
+
+
+class Scanner(object):
+    """
+    Implements the HTSQL scanner.
+
+    The scanner tokenizes the input query and produces a stream of tokens.
+
+    The first step of scanning is decoding ``%``-escape sequences.  Then
+    the scanner splits the input query to a list of tokens.  The following
+    token types are emitted:
+
+    *NAME*
+        Represents an HTSQL identifier: a sequence of alphanumeric characters
+        that does not start with a digit.  Alphanumeric characters include
+        characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
+        Unicode characters that are classified as alphanumeric.
+
+    *NUMBER*
+        Represents a number literal: integer, float and scientific notations
+        are recognized.
+
+    *STRING*
+        Represents a string literal enclosed in single quotes; any single
+        quote character should be doubled.
+
+    *SYMBOL*
+        Represents a valid symbol in HTSQL grammar; one of the following
+        symbols:
+
+        *comparison operators*
+            ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
+            ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
+            ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
+            ``=~~``, ``!=~~``, ``<``, ``<=``, ``>``, ``>=``.
+
+        *logical operators*
+            ``!``, ``&``, ``|``, ``->``, ``?``.
+
+        *arithmetic operators*
+            ``+``, ``-``, ``*``, ``^``.
+
+        *punctuation*
+            ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
+            ``.``, ``,``, ``/``.
+
+    There are also two special token types:
+
+    *WHITESPACE*
+        Represents a whitespace character, one of: space, tab, LF, CR, FF, VT,
+        and those Unicode characters that are classified as space.  Whitespace
+        tokens are immediately discarded by the scanner and never emitted.
+
+    *END*
+        Indicates the end of the query; it is always the last token emitted.
+
+    The :class:`Scanner` constructor takes the following argument:
+
+    `input`
+        An HTSQL query.
+    """
+
+    # List of tokens generated by the scanner.
+    tokens = [
+            SpaceToken,
+            NameToken,
+            NumberToken,
+            StringToken,
+            SymbolToken,
+            EndToken,
+    ]
+    # A regular expression that matches an HTSQL token.  The expression
+    # is compiled from individual token patterns on the first instantiation
+    # of the scanner.
+    regexp = None
+
+    # The regular expression to match %-escape sequences.
+    escape_pattern = r"""%(?P<code>[0-9A-Fa-f]{2})?"""
+    escape_regexp = re.compile(escape_pattern)
+
+    def __init__(self, input):
+        assert isinstance(input, str)
+
+        self.input = input
+        self.init_regexp()
+
+    @classmethod
+    def init_regexp(cls):
+        # Compile the regular expression that matches all token types.
+
+        # If it is already done, return.
+        if cls.regexp is not None:
+            return
+
+        # Get patterns for each token type grouped by the class name.
+        token_patterns = []
+        for token_class in cls.tokens:
+            pattern = r"(?P<%s> %s)" % (token_class.name, token_class.pattern)
+            token_patterns.append(pattern)
+
+        # Combine the individual patterns and compile the expression.
+        pattern = r"|".join(token_patterns)
+        cls.regexp = re.compile(pattern, re.X|re.U)
+
+    def unquote(self, data):
+        # Decode %-escape sequences.
+
+        def replace(match):
+            # Two hexdecimal digits that encode a byte value.
+            code = match.group('code')
+            # Complain if we get `%` not followed by two hexdecimal digits.
+            if not code:
+                mark = Mark(match.string, match.start(), match.end())
+                raise ScanError("invalid escape sequence", mark)
+            # Return the character corresponding to the escape sequence.
+            return chr(int(code, 16))
+
+        # Apply `replace` to all `%`-escape sequences and return the result.
+        data = self.escape_regexp.sub(replace, data)
+        return data
+
+    def scan(self):
+        """
+        Tokenizes the query; returns a :class:`TokenStream` instance.
+
+        In case of syntax errors, raises :exc:`htsql.tr.error.ScanError`.
+        """
+        # Decode %-escape sequences.
+        unquoted_input = self.unquote(self.input)
+        # Since we want the `\w` pattern to match Unicode characters
+        # classified as alphanumeric, we have to convert the input query
+        # to Unicode.
+        try:
+            decoded_input = unquoted_input.decode('utf-8')
+        except UnicodeDecodeError, exc:
+            mark = Mark(unquoted_input, exc.start, exc.end)
+            raise ScanError("cannot decode an UTF-8 character: %s"
+                            % exc.reason, mark)
+
+        # The beginning of the next token.
+        start = 0
+        # The beginning of the mark slice.  Note that both `start` and
+        # `mark_start` point to the same position in the query string;
+        # however `start` is measured in Unicode characters while
+        # `mark_start` is measured in octets.
+        mark_start = 0
+        # Have we reached the final token?
+        is_end = False
+        # The list of generated tokens.
+        tokens = []
+
+        # Loop till we get the final token.
+        while not is_end:
+            # Match the next token.
+            match = self.regexp.match(decoded_input, start)
+            if match is None:
+                mark = Mark(unquoted_input, mark_start, mark_start+1)
+                raise ScanError("unexpected character %r"
+                                % decoded_input[start].encode('utf-8'), mark)
+
+            # Find the token class that matched the token.
+            for token_class in self.tokens:
+                group = match.group(token_class.name)
+                if group is not None:
+                    break
+            else:
+                # Unreachable.
+                assert False
+
+            # The raw string that matched the token pattern.
+            group = group.encode('utf-8')
+            # Unquote the token value.
+            value = token_class.unquote(group)
+            # The end of the mark slice.
+            mark_end = mark_start+len(group)
+
+            # Skip whitespace tokens; otherwise produce the next token.
+            if not token_class.is_ws:
+                mark = Mark(unquoted_input, mark_start, mark_end)
+                token = token_class(value, mark)
+                tokens.append(token)
+
+            # Check if we reached the final token.
+            if token_class.is_end:
+                is_end = True
+
+            # Advance the pointers to the beginning of the next token.
+            start = match.end()
+            mark_start = mark_end
+
+        return TokenStream(tokens)
+
+
+def scan(input):
+    """
+    Tokenizes the input HTSQL query or expression.
+    
+    Returns a stream of tokens (a :class:`TokenStream` instance).
+
+    In case of syntax errors, raises :exc:`htsql.tr.error.ScanError`.
+    """
+    scanner = Scanner(input)
+    return scanner.scan()
+
+

src/htsql/tr/scanner.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-:mod:`htsql.tr.scanner`
-=======================
-
-This module implements the HTSQL scanner.
-"""
-
-
-from .token import (Token, SpaceToken, NameToken, StringToken, NumberToken,
-                    SymbolToken, EndToken)
-from ..mark import Mark
-from ..error import InvalidSyntaxError
-from ..util import maybe, listof
-import re
-
-
-class TokenStream(object):
-    """
-    A sequence of tokens.
-
-    :class:`TokenStream` wraps a list of tokens with a convenient interface
-    for consumption and look-ahead.
-
-    `tokens` (a list of :`class:htsql.token.Token` objects)
-        A list of tokens.
-    """
-
-    def __init__(self, tokens):
-        assert isinstance(tokens, listof(Token))
-
-        # The list of tokens.
-        self.tokens = tokens
-        # The position of the active token.
-        self.idx = 0
-
-    def peek(self, token_class=None, values=None, ahead=0,
-             do_pop=False, do_force=False):
-        """
-        Returns the active token.
-
-        If the parameter `ahead` is non-zero, the method will return
-        the next `ahead` token after the active one.
-
-        When `token_class` is set, the method checks if the token is an
-        instance of the given class.  When `values` is set, the method
-        checks if the token value belongs to the given list of values.
-        If any of the checks fail, the method either raises
-        :exc:`htsql.error.InvaludSyntaxError` or
-        returns ``None`` depending on the value of the `do_force`
-        parameter.
-
-        This method advances the active pointer to the next token if
-        `do_pop` is enabled.
-
-        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
-            If not ``None``, the method checks that the returned token
-            is an instance of `token_class`.
-
-        `values` (a list of strings or ``None``)
-            If not ``None``, the method checks that the value of the
-            returned token belongs to the list.
-
-        `ahead` (an integer)
-            The position of the returned token relative to the active one.
-
-        `do_pop` (Boolean)
-            If set, the method will advance the active pointer.
-
-        `do_force` (Boolean)
-            This flag affects the method behavior when any of the token
-            checks fail.  If set, the method will raise
-            :exc:`htsql.error.InvaludSyntaxError`; otherwise it will return
-            ``None``.
-        """
-        # Sanity check on the arguments.
-        assert token_class is None or issubclass(token_class, Token)
-        assert isinstance(values, maybe(listof(str)))
-        assert token_class is not None or values is None
-        assert isinstance(ahead, int) and ahead >= 0
-        assert self.idx+ahead < len(self.tokens)
-        assert isinstance(do_pop, bool)
-        assert isinstance(do_force, bool)
-
-        # Get the token we are going to return.  It is the responsibility
-        # of the caller to ensure that the index is in the list bounds.
-        token = self.tokens[self.idx+ahead]
-
-        # Indicates if the token passed the given tests.
-        is_expected = True
-        if token_class is not None:
-            # Check the token type.
-            if not isinstance(token, token_class):
-                is_expected = False
-            else:
-                if values is not None:
-                    # Check the token value.
-                    if token.value not in values:
-                        is_expected = False
-
-        # The token failed the checks.
-        if not is_expected:
-            # If `do_force` is not set, return `None`; otherwise generate
-            # an error message of the form:
-            #   expected {token_class.NAME} ('{values[0]}', ...);
-            #   got {token.NAME} '{token.value}'
-            if not do_force:
-                return None
-            expected = "%s" % token_class.name.upper()
-            if values:
-                if len(values) == 1:
-                    expected = "%s %r" % (expected, values[0])
-                else:
-                    expected = "%s (%s)" % (expected,
-                                            ", ".join(repr(value)
-                                                      for value in values))
-            got = "%s %r" % (token.name.upper(), token.value)
-            raise InvalidSyntaxError("expected %s; got %s" % (expected, got),
-                                     token.mark)
-        # Advance the pointer.
-        if do_pop:
-            self.idx += ahead+1
-
-        return token
-
-    def pop(self, token_class=None, values=None):
-        """
-        Returns the active token and advances the pointer to the next token.
-
-        When `token_class` is set, the method checks if the token is an
-        instance of the given class.  When `values` is set, the method
-        checks if the token value belongs to the given list of values.
-        If any of the checks fail, the method raises
-        :exc:`htsql.error.InvaludSyntaxError`.
-
-        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
-            If not ``None``, the method checks that the active token
-            is an instance of `token_class`.
-
-        `values` (a list of strings or ``None``)
-            If not ``None``, the method checks that the value of the active
-            token belongs to the list.
-        """
-        return self.peek(token_class, values,
-                         do_pop=True, do_force=True)
-
-
-class Scanner(object):
-    """
-    Implements the HTSQL scanner.
-
-    The scanner tokenizes the input query and produces a stream of tokens.
-
-    The first step of scanning is decoding ``%``-escape sequences.  Then
-    the scanner splits the input query to a list of tokens.  The following
-    token types are emitted:
-
-    *NAME*
-        Represents an HTSQL identifier: a sequence of alphanumeric characters
-        that does not start with a digit.  Alphanumeric characters include
-        characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
-        Unicode characters that are classified as alphanumeric.
-
-    *NUMBER*
-        Represents a number literal: integer, float and scientific notations
-        are recognized.
-
-    *STRING*
-        Represents a string literal enclosed in single quotes; any single
-        quote character should be doubled.
-
-    *SYMBOL*
-        Represents a valid symbol in HTSQL grammar; one of the following
-        symbols:
-
-        *comparison operators*
-            ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
-            ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
-            ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
-            ``=~~``, ``!=~~``, ``<``, ``<=``, ``>``, ``>=``.
-
-        *logical operators*
-            ``!``, ``&``, ``|``, ``->``, ``?``.
-
-        *arithmetic operators*
-            ``+``, ``-``, ``*``, ``^``.
-
-        *punctuation*
-            ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
-            ``.``, ``,``, ``/``.
-
-    There are also two special token types:
-
-    *WHITESPACE*
-        Represents a whitespace character, one of: space, tab, LF, CR, FF, VT,
-        and those Unicode characters that are classified as space.  The
-        whitespace tokens are immediately discarded by the scanner and never
-        emitted.
-
-    *END*
-        Indicates the end of the query; it is always the last token emitted.
-
-    The :class:`Scanner` constructor takes the following argument:
-
-    `input`
-        An HTSQL query.
-    """
-
-    # List of tokens generated by the scanner.
-    tokens = [
-            SpaceToken,
-            NameToken,
-            NumberToken,
-            StringToken,
-            SymbolToken,
-            EndToken,
-    ]
-    # A regular expression that matches an HTSQL token.  The expression
-    # is compiled from individual token patterns on the first instantiation
-    # of the scanner.
-    regexp = None
-
-    # The regular expression to match %-escape sequences.
-    escape_pattern = r"""%(?P<code>[0-9A-Fa-f]{2})?"""
-    escape_regexp = re.compile(escape_pattern)
-
-    def __init__(self, input):
-        assert isinstance(input, str)
-
-        self.input = input
-        self.init_regexp()
-
-    @classmethod
-    def init_regexp(cls):
-        # Compile the regular expression that matches all token types.
-
-        # If it is already done, return.
-        if cls.regexp is not None:
-            return
-
-        # Get patterns for each token type grouped by the class name.
-        token_patterns = []
-        for token_class in cls.tokens:
-            pattern = r"(?P<%s> %s)" % (token_class.name, token_class.pattern)
-            token_patterns.append(pattern)
-
-        # Combine the individual patterns and compile the expression.
-        pattern = r"|".join(token_patterns)
-        cls.regexp = re.compile(pattern, re.X|re.U)
-
-    def unquote(self, data):
-        # Decode %-escape sequences.
-
-        def replace(match):
-            # Two hexadecimal digits that encode a byte value.
-            code = match.group('code')
-            # Complain if we get `%` not followed by two hexadecimal digits.
-            if not code:
-                mark = Mark(match.string, match.begin(), match.end())
-                raise InvalidSyntaxError("invalid escape sequence", mark)
-            # Return the character corresponding to the escape sequence.
-            return chr(int(code, 16))
-
-        # Apply `replace` to all `%`-escape sequences and return the result.
-        data = self.escape_regexp.sub(replace, data)
-        return data
-
-    def scan(self):
-        """
-        Tokenizes the query; returns a :class:`TokenStream` instance.
-
-        In case of syntax errors, raises :exc:`htsql.error.InvalidSyntaxError`.
-        """
-        # Decode %-escape sequences.
-        unquoted_input = self.unquote(self.input)
-        # Since we want the `\w` pattern to match Unicode characters
-        # classified as alphanumeric, we have to convert the input query
-        # to Unicode.
-        try:
-            decoded_input = unquoted_input.decode('utf-8')
-        except UnicodeDecodeError, exc:
-            mark = Mark(unquoted_input, exc.start, exc.end)
-            raise InvalidSyntaxError("cannot decode an UTF-8 character: %s"
-                                     % exc.reason, mark)
-
-        # The beginning of the next token.
-        start = 0
-        # The beginning of the mark slice.  Note that both `start` and
-        # `mark_start` point to the same position in the query string;
-        # however `start` is measured in Unicode characters while
-        # `mark_start` is measured in octets.
-        mark_start = 0
-        # Have we reached the final token?
-        is_end = False
-        # The list of generated tokens.
-        tokens = []
-
-        # Loop till we get the final token.
-        while not is_end:
-            # Match the next token.
-            match = self.regexp.match(decoded_input, start)
-            if match is None:
-                mark = Mark(unquoted_input, mark_start, mark_start+1)
-                raise InvalidSyntaxError("unexpected character %r" %
-                                         decoded_input[start].encode('utf-8'),
-                                         mark)
-
-            # Find the token class that matched the token.
-            for token_class in self.tokens:
-                group = match.group(token_class.name)
-                if group is not None:
-                    break
-            else:
-                # Unreachable.
-                assert False
-
-            # The raw string that matched the token pattern.
-            group = group.encode('utf-8')
-            # Unquote the token value.
-            value = token_class.unquote(group)
-            # The end of the mark slice.
-            mark_end = mark_start+len(group)
-
-            # Skip whitespace tokens; otherwise produce the next token.
-            if not token_class.is_ws:
-                mark = Mark(unquoted_input, mark_start, mark_end)
-                token = token_class(value, mark)
-                tokens.append(token)
-
-            # Check if we reached the final token.
-            if token_class.is_end:
-                is_end = True
-
-            # Advance the pointers to the beginning of the next token.
-            start = match.end()
-            mark_start = mark_end
-
-        return TokenStream(tokens)
-
-

src/htsql/tr/syntax.py

 
     def __str__(self):
         # Generate an HTSQL query corresponding to the node.
-        # Note that since the format identifier could split the segment
-        # expression between the selector and the filter, we have to
-        # serialize parts of the segment expression separately.
         chunks = []
         chunks.append('/')
         if self.segment is not None:
 
     def __init__(self, value, mark):
         assert isinstance(value, str)
+
         super(IdentifierSyntax, self).__init__(mark)
-
         self.value = value
 
     def __str__(self):

src/htsql/tr/token.py

     """
 
     name = 'string'
+    # Note: we do not permit `NUL` characters in a string literal.
     pattern = r""" ' (?: [^'\0] | '')* ' """
 
     @classmethod

src/htsql/translate.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-:mod:`htsql.translate`
-======================
-
-This module implements the translate utility.
-"""
-
-
-from .adapter import Utility
-from .tr.parser import QueryParser
-from .tr.binder import Binder
-from .tr.encoder import Encoder
-from .tr.assembler import Assembler
-from .tr.outliner import Outliner
-from .tr.compiler import Compiler
-from .tr.serializer import Serializer
-
-
-class Translate(Utility):
-
-    def __call__(self, uri):
-        parser = QueryParser(uri)
-        syntax = parser.parse()
-        binder = Binder()
-        binding = binder.bind_one(syntax)
-        encoder = Encoder()
-        code = encoder.encode(binding)
-        assembler = Assembler()
-        term = assembler.assemble(code)
-        outliner = Outliner()
-        sketch = outliner.outline(term)
-        compiler = Compiler()
-        frame = compiler.compile(sketch)
-        serializer = Serializer()
-        plan = serializer.serialize(frame)
-        return plan
-
-

src/htsql_sqlite/domain.py

     """
     Represents a pseudo-date type for SQLite.
 
-    In SQL, values of the ``DATE`` type are represented as ``TEXT`` values
+    In SQL, date values are expressed as ``TEXT`` values
     of the form ``YYYY-MM-DD``.
     """