Commits

Kirill Simonov  committed 31aa1e0

Added `htsql.tr` package; `tr` stands for "translator".

Moved the scanner and the parser to `htsql.tr`.

  • Participants
  • Parent commits c45e8a3

Comments (0)

Files changed (10)

File src/htsql/application.py

 
 from .error import HTTPError
 from .util import DB
-from .parser import QueryParser
+from .tr.parser import QueryParser
 import urllib
 
 

File src/htsql/parser.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-This module implements the HTSQL parser.
-
-The parser takes a stream of tokens from the scanner and produces
-a syntax tree.
-
-Here is the grammar of HTSQL::
-
-    stream          ::= query END
-
-    query           ::= '/'
-                      | '/' base format? filter?
-                      | '/' base? selector format? filter?
-    base            ::= identifier | group
-    format          ::= '.' identifier
-    filter          ::= '?' test
-
-    element         ::= test ( '+' | '-' )*
-    test            ::= and_test ( '|' and_test )*
-    and_test        ::= implies_test ( '&' implies_test )*
-    implies_test    ::= unary_test ( '->' unary_test )?
-    unary_test      ::= '!' unary_test | comparison
-
-    comparison      ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
-                                       '$~~' | '$~' | '~~' | '~' |
-                                       '!=~~' | '!=~' | '!^~~' | '!^~' |
-                                       '!$~~' | '!$~' | '!~~' | '!~' |
-                                       '<=' | '<' | '>=' |  '>' |
-                                       '==' | '=' | '!==' | '!=' )
-                                     expression )?
-
-    expression      ::= term ( ( '+' | '-' ) term )*
-    term            ::= factor ( ( '*' | identifier ) factor )*
-    factor          ::= ( '+' | '-' ) factor | power
-    power           ::= sieve ( '^' power )?
-
-    sieve           ::= specifier selector? filter?
-    specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
-    atom            ::= '*' | selector | group | identifier call? | literal
-
-    group           ::= '(' element ')'
-    call            ::= '(' elements? ')'
-    selector        ::= '{' elements? '}'
-    elements        ::= element ( ',' element )* ','?
-
-    identifier      ::= NAME
-    literal         ::= STRING | NUMBER
-
-Note that this grammar is almost LL(1); one notable exception is
-the postfix ``+`` and ``-`` operators.
-"""
-
-
-from .mark import Mark
-from .scanner import Scanner
-from .token import NameToken, StringToken, NumberToken, SymbolToken, EndToken
-from .syntax import (QuerySyntax, SegmentSyntax, FormatSyntax, SelectorSyntax,
-                     SieveSyntax, OperatorSyntax, FunctionOperatorSyntax,
-                     FunctionCallSyntax, GroupSyntax, SpecifierSyntax,
-                     IdentifierSyntax, WildcardSyntax,
-                     StringSyntax, NumberSyntax)
-
-
class Parser(object):
    """
    Implements an HTSQL parser.

    This is an abstract class; concrete subclasses implement the
    individual productions of the HTSQL grammar.

    `input` (a string)
        The input HTSQL expression.
    """

    class __metaclass__(type):
        # Provides the shortcut:
        #   Parser << tokens
        # as a synonym for:
        #   Parser.process(tokens)

        def __lshift__(parser_class, tokens):
            return parser_class.process(tokens)

    def __init__(self, input):
        assert isinstance(input, str)
        self.input = input

    def parse(self):
        """
        Parses the input expression; returns the corresponding syntax node.
        """
        # Split the input query into a stream of tokens.
        tokens = Scanner(self.input).scan()
        # Apply the grammar to the token stream.
        syntax = self.process(tokens)
        # The stream must be fully consumed; popping the END token
        # complains otherwise.
        tokens.pop(EndToken)
        return syntax

    @classmethod
    def process(cls, tokens):
        """
        Takes a stream of tokens; returns a syntax node.

        This method is not required to exhaust the token stream.

        `tokens` (:class:`htsql.scanner.TokenStream`)
            The stream of tokens to parse.
        """
        # Override in subclasses.
        raise NotImplementedError()
-
-
class QueryParser(Parser):
    """
    Parses an HTSQL query.
    """

    @classmethod
    def process(cls, tokens):
        # Parse the productions:
        #   query           ::= '/'
        #                     | '/' base format? filter?
        #                     | '/' base? selector format? filter?
        #   base            ::= identifier | group
        #   format          ::= '.' identifier
        #   filter          ::= '?' test
        head_token = tokens.pop(SymbolToken, ['/'])
        base = None
        selector = None
        filter = None
        segment = None
        format = None
        # The segment opens with an optional base followed by an
        # optional selector.
        if tokens.peek(NameToken):
            base = IdentifierParser << tokens
        elif tokens.peek(SymbolToken, ['(']):
            base = GroupParser << tokens
        if tokens.peek(SymbolToken, ['{']):
            selector = SelectorParser << tokens
        # A format and a filter may only follow a non-empty segment.
        if not (base is None and selector is None):
            if tokens.peek(SymbolToken, ['.'], do_pop=True):
                identifier = IdentifierParser << tokens
                format = FormatSyntax(identifier, identifier.mark)
            if tokens.peek(SymbolToken, ['?'], do_pop=True):
                filter = TestParser << tokens
            segment = SegmentSyntax(base, selector, filter,
                                    Mark.union(base, selector, filter))
        return QuerySyntax(segment, format,
                           Mark.union(head_token, segment, format))
-
-
class ElementParser(Parser):
    """
    Parses an `element` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   element         ::= test ( '+' | '-' )*
        element = TestParser << tokens
        # Fold any trailing `+`/`-` symbols into postfix operator nodes.
        symbol_token = tokens.peek(SymbolToken, ['+', '-'], do_pop=True)
        while symbol_token is not None:
            element = OperatorSyntax(symbol_token.value, element, None,
                                     Mark.union(element, symbol_token))
            symbol_token = tokens.peek(SymbolToken, ['+', '-'], do_pop=True)
        return element
-
-
class TestParser(Parser):
    """
    Parses a `test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   test            ::= and_test ( '|' and_test )*
        # `|` is left-associative, so the operands are folded as we go.
        test = AndTestParser << tokens
        while tokens.peek(SymbolToken, ['|']):
            symbol_token = tokens.pop(SymbolToken, ['|'])
            operand = AndTestParser << tokens
            test = OperatorSyntax(symbol_token.value, test, operand,
                                  Mark.union(test, operand))
        return test
-
-
class AndTestParser(Parser):
    """
    Parses an `and_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   and_test        ::= implies_test ( '&' implies_test )*
        test = ImpliesTestParser << tokens
        while tokens.peek(SymbolToken, ['&']):
            symbol_token = tokens.pop(SymbolToken, ['&'])
            symbol = symbol_token.value
            left = test
            # The right operand was parsed with `ImpliesParser`, a name
            # that is not defined anywhere in this module; the production
            # calls for an `implies_test` operand.
            right = ImpliesTestParser << tokens
            mark = Mark.union(left, right)
            test = OperatorSyntax(symbol, left, right, mark)
        return test
-
-
class ImpliesTestParser(Parser):
    """
    Parses an `implies_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   implies_test    ::= unary_test ( '->' unary_test )?
        left = UnaryTestParser << tokens
        # `->` admits at most one right operand.
        symbol_token = tokens.peek(SymbolToken, ['->'], do_pop=True)
        if symbol_token is None:
            return left
        right = UnaryTestParser << tokens
        return OperatorSyntax(symbol_token.value, left, right,
                              Mark.union(left, right))
-
-
class UnaryTestParser(Parser):
    """
    Parses a `unary_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   unary_test      ::= '!' unary_test | comparison
        # Consume the (possibly empty) run of `!` prefixes first...
        prefix_tokens = []
        while tokens.peek(SymbolToken, ['!']):
            prefix_tokens.append(tokens.pop(SymbolToken, ['!']))
        test = ComparisonParser << tokens
        # ...then wrap the comparison, innermost prefix first.
        for symbol_token in reversed(prefix_tokens):
            test = OperatorSyntax(symbol_token.value, None, test,
                                  Mark.union(symbol_token, test))
        return test
-
-
class ComparisonParser(Parser):
    """
    Parses a `comparison` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   comparison  ::= expression ( <comparison operator> expression )?
        # where the comparison operator is one of the (in)equality,
        # matching and ordering symbols listed below.
        left = ExpressionParser << tokens
        symbol_token = tokens.peek(SymbolToken,
                                   ['=~~', '=~', '^~~', '^~',
                                    '$~~', '$~', '~~', '~',
                                    '!=~~', '!=~', '!^~~', '!^~',
                                    '!$~~', '!$~', '!~~', '!~',
                                    '<=', '<', '>=', '>',
                                    '==', '=', '!==', '!='], do_pop=True)
        # Without an operator, the comparison degenerates to a bare
        # expression.
        if symbol_token is None:
            return left
        right = ExpressionParser << tokens
        return OperatorSyntax(symbol_token.value, left, right,
                              Mark.union(left, right))
-
-
class ExpressionParser(Parser):
    """
    Parses an `expression` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   expression      ::= term ( ( '+' | '-' ) term )*
        expression = TermParser << tokens
        # A look-ahead distinguishes between the productions:
        #   element         ::= test ( '+' | '-' )*
        # and
        #   expression      ::= term ( ( '+' | '-' ) term )*
        # The FOLLOWS set of `element` consists of the symbols:
        #   ',', ')', and '}',
        # none of which may start a `term`; so when the run of `+`/`-`
        # symbols is followed by one of them, the symbols belong to the
        # enclosing `element` and we stop here.
        while tokens.peek(SymbolToken, ['+', '-']):
            offset = 1
            while tokens.peek(SymbolToken, ['+', '-'], ahead=offset):
                offset += 1
            if tokens.peek(SymbolToken, [',', ')', '}'], ahead=offset):
                break
            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
            term = TermParser << tokens
            expression = OperatorSyntax(symbol_token.value, expression,
                                        term, Mark.union(expression, term))
        return expression
-
-
class TermParser(Parser):
    """
    Parses a `term` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   term            ::= factor ( ( '*' | identifier ) factor )*
        expression = FactorParser << tokens
        while (tokens.peek(SymbolToken, ['*'])
               or tokens.peek(NameToken)):
            if tokens.peek(SymbolToken, ['*']):
                # Was `token.pop(...)` and `token.value`: `token` is not
                # defined in this scope -- the stream is called `tokens`,
                # and the symbol must be read from the popped token.
                symbol_token = tokens.pop(SymbolToken, ['*'])
                symbol = symbol_token.value
                left = expression
                right = FactorParser << tokens
                mark = Mark.union(left, right)
                expression = OperatorSyntax(symbol, left, right, mark)
            else:
                # A name in the operator position denotes a function
                # applied as an infix operator.
                identifier = IdentifierParser << tokens
                left = expression
                right = FactorParser << tokens
                mark = Mark.union(left, right)
                expression = FunctionOperatorSyntax(identifier,
                                                    left, right, mark)
        return expression
-
-
class FactorParser(Parser):
    """
    Parses a `factor` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   factor          ::= ( '+' | '-' ) factor | power
        symbol_tokens = []
        while tokens.peek(SymbolToken, ['+', '-']):
            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
            symbol_tokens.append(symbol_token)
        expression = PowerParser << tokens
        # Apply the unary prefixes innermost-first.
        while symbol_tokens:
            symbol_token = symbol_tokens.pop()
            symbol = symbol_token.value
            # Was `Mark.union(symbol_token, test)`: `test` is not defined
            # in this method (a leftover from the parallel code in
            # `UnaryTestParser`); the mark must span the prefix symbol
            # and the wrapped expression.
            mark = Mark.union(symbol_token, expression)
            expression = OperatorSyntax(symbol, None, expression, mark)
        return expression
-
-
class PowerParser(Parser):
    """
    Parses a `power` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   power           ::= sieve ( '^' power )?
        expression = SieveParser << tokens
        # Note that there is some grammar ambiguity here: if the sieve
        # contains a filter part, the sieve parser will greedily consume
        # any `^` expression.
        if not tokens.peek(SymbolToken, ['^']):
            return expression
        symbol_token = tokens.pop(SymbolToken, ['^'])
        symbol = symbol_token.value
        left = expression
        # `^` is right-associative, so recurse for the right operand.
        right = PowerParser << tokens
        mark = Mark.union(left, right)
        # Was `OperatorSyntax(symbol, None, expression, mark)`, which
        # discarded the parsed right operand and produced a prefix node;
        # `^` is a binary operator over `left` and `right`.
        expression = OperatorSyntax(symbol, left, right, mark)
        return expression
-
-
class SieveParser(Parser):
    """
    Parses a `sieve` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   sieve           ::= specifier selector? filter?
        expression = SpecifierParser << tokens
        selector = None
        filter = None
        if tokens.peek(SymbolToken, ['{']):
            selector = SelectorParser << tokens
        if tokens.peek(SymbolToken, ['?'], do_pop=True):
            filter = TestParser << tokens
        # With neither a selector nor a filter, the sieve degenerates to
        # a bare specifier; avoid the extra wrapper node in that case.
        if selector is None and filter is None:
            return expression
        return SieveSyntax(expression, selector, filter,
                           Mark.union(expression, selector, filter))
-
-
class SpecifierParser(Parser):
    """
    Parses a `specifier` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the productions:
        #   specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
        #   call            ::= '(' elements? ')'
        #   elements        ::= element ( ',' element )* ','?
        expression = AtomParser << tokens
        while tokens.peek(SymbolToken, ['.']):
            head_token = tokens.pop(SymbolToken, ['.'])
            # A trailing `.*` terminates the specifier.
            symbol_token = tokens.peek(SymbolToken, ['*'], do_pop=True)
            if symbol_token is not None:
                wildcard = WildcardSyntax(symbol_token.mark)
                expression = SpecifierSyntax(expression, wildcard,
                                             Mark.union(head_token,
                                                        wildcard))
                break
            identifier = IdentifierParser << tokens
            if not tokens.peek(SymbolToken, ['(']):
                # A plain attribute access.
                expression = SpecifierSyntax(expression, identifier,
                                             Mark.union(head_token,
                                                        identifier))
                continue
            # A method call with a (possibly empty) argument list.
            tokens.pop(SymbolToken, ['('])
            arguments = []
            while not tokens.peek(SymbolToken, [')']):
                arguments.append(ElementParser << tokens)
                if not tokens.peek(SymbolToken, [')']):
                    # `)` cannot occur here; it is listed among the
                    # accepted values only to improve the error message.
                    tokens.pop(SymbolToken, [',', ')'])
            tail_token = tokens.pop(SymbolToken, [')'])
            expression = FunctionCallSyntax(expression, identifier,
                                            arguments,
                                            Mark.union(head_token,
                                                       tail_token))
        return expression
-
-
class AtomParser(Parser):
    """
    Parses an `atom` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the productions:
        #   atom        ::= '*' | selector | group | identifier call? | literal
        #   call        ::= '(' elements? ')'
        #   elements    ::= element ( ',' element )* ','?
        #   literal     ::= STRING | NUMBER
        # Dispatch on the leading token; each alternative is returned
        # with an early `return`.
        if tokens.peek(SymbolToken, ['*']):
            symbol_token = tokens.pop(SymbolToken, ['*'])
            return WildcardSyntax(symbol_token.mark)
        if tokens.peek(SymbolToken, ['(']):
            return GroupParser << tokens
        if tokens.peek(SymbolToken, ['{']):
            return SelectorParser << tokens
        if tokens.peek(NameToken):
            identifier = IdentifierParser << tokens
            if not tokens.peek(SymbolToken, ['(']):
                return identifier
            # A function call with a (possibly empty) argument list.
            tokens.pop(SymbolToken, ['('])
            arguments = []
            while not tokens.peek(SymbolToken, [')']):
                arguments.append(ElementParser << tokens)
                if not tokens.peek(SymbolToken, [')']):
                    # `)` cannot occur here; it is listed among the
                    # accepted values only to improve the error message.
                    tokens.pop(SymbolToken, [',', ')'])
            tail_token = tokens.pop(SymbolToken, [')'])
            return FunctionCallSyntax(None, identifier, arguments,
                                      Mark.union(identifier, tail_token))
        if tokens.peek(StringToken):
            token = tokens.pop(StringToken)
            return StringSyntax(token.value, token.mark)
        if tokens.peek(NumberToken):
            token = tokens.pop(NumberToken)
            return NumberSyntax(token.value, token.mark)
        # No alternative matched; popping a NAME token is guaranteed to
        # fail here and produce an error message.
        tokens.pop(NameToken)
        # Not reachable.
        assert False
-
-
class GroupParser(Parser):
    """
    Parses a `group` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   group           ::= '(' element ')'
        # The first parameter used to be named `self`, but this is a
        # class method; renamed to `cls` for consistency with every
        # other parser in this module.
        head_token = tokens.pop(SymbolToken, ['('])
        expression = ElementParser << tokens
        tail_token = tokens.pop(SymbolToken, [')'])
        mark = Mark.union(head_token, tail_token)
        group = GroupSyntax(expression, mark)
        return group
-
-
class SelectorParser(Parser):
    """
    Parses a `selector` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the productions:
        #   selector        ::= '{' elements? '}'
        #   elements        ::= element ( ',' element )* ','?
        head_token = tokens.pop(SymbolToken, ['{'])
        elements = []
        while not tokens.peek(SymbolToken, ['}']):
            elements.append(ElementParser << tokens)
            if not tokens.peek(SymbolToken, ['}']):
                # We know it's not going to be '}', but we put it into
                # the list of accepted values to generate a better error
                # message.
                tokens.pop(SymbolToken, [',', '}'])
        tail_token = tokens.pop(SymbolToken, ['}'])
        return SelectorSyntax(elements, Mark.union(head_token, tail_token))
-
-
class IdentifierParser(Parser):
    """
    Parses an `identifier` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   identifier      ::= NAME
        name_token = tokens.pop(NameToken)
        identifier = IdentifierSyntax(name_token.value, name_token.mark)
        return identifier
-
-

File src/htsql/scanner.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-This module implements the HTSQL scanner.
-
-The scanner tokenizes the input query and produces a stream of tokens.
-
-The first step of scanning is decoding ``%``-escape sequences.  Then
-the scanner splits the input query to a list of tokens.  The following
-token types are emitted:
-
-*NAME*
-    Represents an HTSQL identifier: a sequence of alphanumeric characters
-    that does not start with a digit.  Alphanumeric characters include
-    characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
-    Unicode characters that are classified as alphanumeric.
-
-*NUMBER*
-    Represents a number literal: integer, float and scientific notations
-    are recognized.
-
-*STRING*
-    Represents a string literal enclosed in single quotes; any single
-    quote character should be doubled.
-
-*SYMBOL*
-    Represents a valid symbol in HTSQL grammar; one of the following
-    symbols:
-
-    *comparison operators*
-        ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
-        ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
-        ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
-        ``=~~``, ``!=~~``, ``<``, ``<=``, ``>``, ``>=``.
-
-    *logical operators*
-        ``!``, ``&``, ``|``, ``->``, ``?``.
-
-    *arithmetic operators*
-        ``+``, ``-``, ``*``, ``^``.
-
-    *punctuation*
-        ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
-        ``.``, ``,``, ``/``.
-
-There are also two special token types:
-
-*WHITESPACE*
-    Represents a whitespace character, one of: space, tab, LF, CR, FF, VT,
-    and those Unicode characters that are classified as space.  The
-    whitespace tokens are immediately discarded by the scanner and never
-    emitted.
-
-*END*
-    Indicates the end of the query; it is always the last token emitted.
-"""
-
-
-from token import (Token, SpaceToken, NameToken, StringToken, NumberToken,
-                   SymbolToken, EndToken)
-from mark import Mark
-from error import InvalidSyntaxError
-from util import maybe, listof
-import re
-
-
class TokenStream(object):
    """
    A sequence of tokens.

    :class:`TokenStream` wraps a list of tokens with a convenient interface
    for consumption and look-ahead.

    `tokens` (a list of :class:`htsql.token.Token` objects)
        A list of tokens.
    """

    def __init__(self, tokens):
        assert isinstance(tokens, listof(Token))

        # The list of tokens.
        self.tokens = tokens
        # The position of the active token.
        self.idx = 0

    def peek(self, token_class=None, values=None, ahead=0,
             do_pop=False, do_force=False):
        """
        Returns the active token.

        If the parameter `ahead` is non-zero, the method will return
        the next `ahead` token after the active one.

        When `token_class` is set, the method checks if the token is an
        instance of the given class.  When `values` is set, the method
        checks if the token value belongs to the given list of values.
        If any of the checks fail, the method either raises
        :exc:`htsql.error.InvalidSyntaxError` or
        returns ``None`` depending on the value of the `do_force`
        parameter.

        This method advances the active pointer to the next token if
        `do_pop` is enabled.

        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
            If not ``None``, the method checks that the returned token
            is an instance of `token_class`.

        `values` (a list of strings or ``None``)
            If not ``None``, the method checks that the value of the
            returned token belongs to the list.

        `ahead` (an integer)
            The position of the returned token relative to the active one.

        `do_pop` (Boolean)
            If set, the method will advance the active pointer.

        `do_force` (Boolean)
            This flag affects the method behavior when any of the token
            checks fail.  If set, the method will raise
            :exc:`htsql.error.InvalidSyntaxError`; otherwise it will
            return ``None``.
        """
        # Sanity check on the arguments.
        assert token_class is None or issubclass(token_class, Token)
        assert isinstance(values, maybe(listof(str)))
        assert token_class is not None or values is None
        assert isinstance(ahead, int) and ahead >= 0
        assert self.idx+ahead < len(self.tokens)
        assert isinstance(do_pop, bool)
        assert isinstance(do_force, bool)

        # Get the token we are going to return.  It is the responsibility
        # of the caller to ensure that the index is in the list bounds.
        token = self.tokens[self.idx+ahead]

        # Indicates if the token passed the given tests.
        is_expected = True
        if token_class is not None:
            # Check the token type.
            if not isinstance(token, token_class):
                is_expected = False
            else:
                if values is not None:
                    # Check the token value.
                    if token.value not in values:
                        is_expected = False

        # The token failed the checks.
        if not is_expected:
            # If `do_force` is not set, return `None`; otherwise generate
            # an error message of the form:
            #   expected {token_class.NAME} ('{values[0]}', ...);
            #   got {token.NAME} '{token.value}'
            if not do_force:
                return None
            expected = "%s" % token_class.name.upper()
            if values:
                if len(values) == 1:
                    expected = "%s %r" % (expected, values[0])
                else:
                    expected = "%s (%s)" % (expected,
                                            ", ".join(repr(value)
                                                      for value in values))
            got = "%s %r" % (token.name.upper(), token.value)
            raise InvalidSyntaxError("expected %s; got %s" % (expected, got),
                                     token.mark)
        # Advance the pointer.
        if do_pop:
            self.idx += ahead+1

        return token

    def pop(self, token_class=None, values=None):
        """
        Returns the active token and advances the pointer to the next token.

        When `token_class` is set, the method checks if the token is an
        instance of the given class.  When `values` is set, the method
        checks if the token value belongs to the given list of values.
        If any of the checks fail, the method raises
        :exc:`htsql.error.InvalidSyntaxError`.

        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
            If not ``None``, the method checks that the active token
            is an instance of `token_class`.

        `values` (a list of strings or ``None``)
            If not ``None``, the method checks that the value of the active
            token belongs to the list.
        """
        return self.peek(token_class, values,
                         do_pop=True, do_force=True)
-
-
-class Scanner(object):
-    """
-    Implements the HTSQL scanner.
-
-    `input`
-        An HTSQL query.
-    """
-
-    # List of tokens generated by the scanner.
-    tokens = [
-            SpaceToken,
-            NameToken,
-            NumberToken,
-            StringToken,
-            SymbolToken,
-            EndToken,
-    ]
-    # A regular expression that matches an HTSQL token.  The expression
-    # is compiled from individual token patterns on the first instantiation
-    # of the scanner.
-    regexp = None
-
-    # The regular expression to match %-escape sequences.
-    escape_pattern = r"""%(?P<code>[0-9A-Fa-f]{2})?"""
-    escape_regexp = re.compile(escape_pattern)
-
-    def __init__(self, input):
-        assert isinstance(input, str)
-
-        self.input = input
-        self.init_regexp()
-
-    @classmethod
-    def init_regexp(cls):
-        # Compile the regular expression that matches all token types.
-
-        # If it is already done, return.
-        if cls.regexp is not None:
-            return
-
-        # Get patterns for each token type grouped by the class name.
-        token_patterns = []
-        for token_class in cls.tokens:
-            pattern = r"(?P<%s> %s)" % (token_class.name, token_class.pattern)
-            token_patterns.append(pattern)
-
-        # Combine the individual patterns and compile the expression.
-        pattern = r"|".join(token_patterns)
-        cls.regexp = re.compile(pattern, re.X|re.U)
-
-    def unquote(self, data):
-        # Decode %-escape sequences.
-
-        def replace(match):
-            # Two hexdecimal digits that encode a byte value.
-            code = match.group('code')
-            # Complain if we get `%` not followed by two hexdecimal digits.
-            if not code:
-                mark = Mark(match.string, match.begin(), match.end())
-                raise InvalidSyntaxError("invalid escape sequence", mark)
-            # Return the character corresponding to the escape sequence.
-            return chr(int(code, 16))
-
-        # Apply `replace` to all `%`-escape sequences and return the result.
-        data = self.escape_regexp.sub(replace, data)
-        return data
-
-    def scan(self):
-        """
-        Tokenizes the query; returns a :class:`TokenStream` instance.
-
-        In case of syntax errors, raises :exc:`htsql.error.InvalidSyntaxError`.
-        """
-        # Decode %-escape sequences.
-        unquoted_input = self.unquote(self.input)
-        # Since we want the `\w` pattern to match Unicode characters
-        # classified as alphanumeric, we have to convert the input query
-        # to Unicode.
-        try:
-            decoded_input = unquoted_input.decode('utf-8')
-        except UnicodeDecodeError, exc:
-            mark = Mark(unquoted_input, exc.start, exc.end)
-            raise InvalidSyntaxError("cannot decode an UTF-8 character: %s"
-                                     % exc.reason, mark)
-
-        # The beginning of the next token.
-        start = 0
-        # The beginning of the mark slice.  Note that both `start` and
-        # `mark_start` point to the same position in the query string;
-        # however `start` is measured in Unicode characters while
-        # `mark_start` is measured in octets.
-        mark_start = 0
-        # Have we reached the final token?
-        is_end = False
-        # The list of generated tokens.
-        tokens = []
-
-        # Loop till we get the final token.
-        while not is_end:
-            # Match the next token.
-            match = self.regexp.match(decoded_input, start)
-            if match is None:
-                mark = Mark(unquoted_input, mark_start, mark_start+1)
-                raise InvalidSyntaxError("unexpected character %r" %
-                                         decoded_input[start].encode('utf-8'),
-                                         mark)
-
-            # Find the token class that matched the token.
-            for token_class in self.tokens:
-                group = match.group(token_class.name)
-                if group is not None:
-                    break
-            else:
-                # Unreachable.
-                assert False
-
-            # The raw string that matched the token pattern.
-            group = group.encode('utf-8')
-            # Unquote the token value.
-            value = token_class.unquote(group)
-            # The end of the mark slice.
-            mark_end = mark_start+len(group)
-
-            # Skip whitespace tokens; otherwise produce the next token.
-            if not token_class.is_ws:
-                mark = Mark(unquoted_input, mark_start, mark_end)
-                token = token_class(value, mark)
-                tokens.append(token)
-
-            # Check if we reached the final token.
-            if token_class.is_end:
-                is_end = True
-
-            # Advance the pointers to the beginning of the next token.
-            start = match.end()
-            mark_start = mark_end
-
-        return TokenStream(tokens)
-
-

File src/htsql/syntax.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-This module defines syntax nodes for the HTSQL grammar.
-"""
-
-
-from .mark import Mark
-from .util import maybe, listof
-import re
-
-
-class Syntax(object):
-    """
-    Represents a syntax node.
-
-    `mark` (:class:`htsql.mark.Mark`)
-        The location of the node in the original query.
-    """
-
-    # The pattern to %-escape unsafe characters.
-    escape_pattern = r"[\x00-\x1F%\x7F]"
-    escape_regexp = re.compile(escape_pattern)
-    escape_replace = (lambda m: "%%%02X" % ord(m.group()))
-
-    def __init__(self, mark):
-        assert isinstance(mark, Mark)
-        self.mark = mark
-
-    def __str__(self):
-        """
-        Returns an HTSQL expression that would produce the same syntax node.
-        """
-        # Override when subclassing.
-        raise NotImplementedError()
-
-    def __repr__(self):
-        return "<%s %s>" % (self.__class__.__name__, self)
-
-
-class QuerySyntax(Syntax):
-    """
-    Represents an HTSQL query.
-
-    An HTSQL query consists of a segment expression and a format identifier.
-
-    `segment` (:class:`SegmentSyntax` or ``None``)
-        The segment expression.
-
-    `format` (:class:`FormatSyntax` or ``None``)
-        The query format.
-    """
-
-    def __init__(self, segment, format, mark):
-        assert isinstance(segment, maybe(SegmentSyntax))
-        assert isinstance(format, maybe(FormatSyntax))
-
-        super(QuerySyntax, self).__init__(mark)
-        self.segment = segment
-        self.format = format
-
-    def __str__(self):
-        # Generate an HTSQL query corresponding to the node.
-        # Note that since the format identifier could split the segment
-        # expression between the selector and the filter, we have to
-        # serialize parts of the segment expression separately.
-        chunks = []
-        chunks.append('/')
-        if self.segment is not None:
-            if self.segment.base is not None:
-                chunks.append(str(self.segment.base))
-            if self.segment.selector is not None:
-                chunks.append(str(self.segment.selector))
-            if self.format is not None:
-                chunks.append('.')
-                chunks.append(str(self.format))
-            if self.segment.filter is not None:
-                chunks.append('?')
-                chunks.append(str(self.segment.filter))
-        return ''.join(chunks)
-
-
-class SegmentSyntax(Syntax):
-    """
-    Represents a segment expression.
-
-    A segment expression consists of the base expression, the selector,
-    and the filter::
-
-        /base{selector}?filter
-
-    `base` (:class:`Syntax` or ``None``)
-        The base expression.
-
-    `selector` (:class:`SelectorSyntax` or ``None``)
-        The selector.
-
-    `filter` (:class:`Syntax` or ``None``)
-        The filter expression.
-    """
-
-    def __init__(self, base, selector, filter, mark):
-        assert isinstance(base, maybe(Syntax))
-        assert isinstance(selector, maybe(SelectorSyntax))
-        assert isinstance(filter, maybe(Syntax))
-
-        super(SegmentSyntax, self).__init__(mark)
-        self.base = base
-        self.selector = selector
-        self.filter = filter
-
-    def __str__(self):
-        # Generate an HTSQL expression of the form:
-        #   base{selector}?filter
-        chunks = []
-        if self.base is not None:
-            chunks.append(str(self.base))
-        if self.selector is not None:
-            chunks.append(str(self.selector))
-        if self.filter is not None:
-            chunks.append('?')
-            chunks.append(str(self.filter))
-        return ''.join(chunks)
-
-
-class FormatSyntax(Syntax):
-    """
-    Represents the format identifier.
-
-    `identifier`
-        The format identifier.
-    """
-    # Note that essentially `FormatSyntax` is a wrapper over
-    # `IdentifierSyntax`.  We wrap the format identifier into
-    # a separate node to enable adapting by the node type.
-
-    def __init__(self, identifier, mark):
-        assert isinstance(identifier, IdentifierSyntax)
-
-        super(FormatSyntax, self).__init__(mark)
-        self.identifier = identifier
-
-    def __str__(self):
-        return str(self.identifier)
-
-
-class SelectorSyntax(Syntax):
-    """
-    Represents a selector expression.
-
-    A selector is a sequence of elements::
-
-        {element, ...}
-
-    `elements` (a list of :class:`Syntax`)
-        The list of selector elements.
-    """
-
-    def __init__(self, elements, mark):
-        assert isinstance(elements, listof(Syntax))
-
-        super(SelectorSyntax, self).__init__(mark)
-        self.elements = elements
-
-    def __str__(self):
-        # Generate an expression of the form:
-        #   {element,...}
-        return '{%s}' % ','.join(str(element)
-                                 for element in self.elements)
-
-
-class SieveSyntax(Syntax):
-    """
-    Represents a sieve expression.
-
-    A sieve expression has the same shape as the segment expression.
-    It consists of the base, the selector and the filter::
-
-        base{selector}?filter
-
-    `base` (:class:`Syntax`)
-        The sieve base expression.
-
-    `selector` (:class:`SelectorSyntax` or ``None``)
-        The sieve selector.
-
-    `filter` (:class:`Syntax` or ``None``)
-        The sieve filter expression.
-    """
-
-    def __init__(self, base, selector, filter, mark):
-        assert isinstance(base, Syntax)
-        assert isinstance(selector, maybe(SelectorSyntax))
-        assert isinstance(filter, maybe(Syntax))
-        assert selector is not None or filter is not None
-
-        super(SieveSyntax, self).__init__(mark)
-        self.base = base
-        self.selector = selector
-        self.filter = filter
-
-    def __str__(self):
-        # Generate an expression of the form:
-        #   base{selector}?filter
-        chunks = []
-        chunks.append(str(self.base))
-        if self.selector is not None:
-            chunks.append(str(self.selector))
-        if self.filter is not None:
-            chunks.append('?')
-            chunks.append(str(self.filter))
-        return ''.join(chunks)
-
-
-class OperatorSyntax(Syntax):
-    """
-    Represents an operator expression.
-
-    An operator expression has one of the following forms::
-
-        left <symbol> right
-        left <symbol>
-        <symbol> right
-
-    `symbol` (a string)
-        The operator.
-
-    `left` (:class:`Syntax` or ``None``)
-        The left argument.
-
-    `right` (:class:`Syntax` or ``None``)
-        The right argument.
-    """
-
-    def __init__(self, symbol, left, right, mark):
-        assert isinstance(symbol, str)
-        assert isinstance(left, maybe(Syntax))
-        assert isinstance(right, maybe(Syntax))
-        assert left is not None or right is not None
-        super(OperatorSyntax, self).__init__(mark)
-        self.symbol = symbol
-        self.left = left
-        self.right = right
-
-    def __str__(self):
-        # Generate an expression of the form:
-        #   left<symbol>right
-        chunks = []
-        if self.left is not None:
-            chunks.append(str(self.left))
-        chunks.append(self.symbol)
-        if self.right is not None:
-            chunks.append(str(self.right))
-        return ''.join(chunks)
-
-
-class FunctionOperatorSyntax(Syntax):
-    """
-    Represents a binary function call in the operator form.
-
-    This expression has the form::
-
-        left identifier right
-
-    and is equivalent to the expression::
-
-        identifier(left, right)
-
-    `identifier` (:class:`IdentifierSyntax`)
-        The function name.
-
-    `left` (:class:`Syntax`)
-        The first argument.
-
-    `right` (:class:`Syntax`)
-        The second argument.
-    """
-
-    def __init__(self, identifier, left, right, mark):
-        assert isinstance(identifier, IdentifierSyntax)
-        assert isinstance(left, Syntax)
-        assert isinstance(right, Syntax)
-
-        super(FunctionOperatorSyntax, self).__init__(mark)
-        self.identifier = identifier
-        self.left = left
-        self.right = right
-
-    def __str__(self):
-        # Generate an expression of the form:
-        #   left identifier right
-        chunks = []
-        if self.left is not None:
-            chunks.append(str(self.left))
-            chunks.append(' ')
-        chunks.append(str(self.identifier))
-        if self.right is not None:
-            chunks.append(' ')
-            chunks.append(str(self.right))
-        return ''.join(chunks)
-
-
-class FunctionCallSyntax(Syntax):
-    """
-    Represents a function or a method call.
-
-    This expression has one of the forms::
-
-        identifier(arguments)
-        base.identifier(arguments)
-
-    `base` (:class:`Syntax` or ``None``)
-        The method base.
-
-    `identifier` (:class:`IdentifierSyntax`)
-        The function name.
-
-    `arguments` (a list of :class:`Syntax`)
-        The list of arguments.
-    """
-
-    def __init__(self, base, identifier, arguments, mark):
-        assert isinstance(base, maybe(Syntax))
-        assert isinstance(identifier, IdentifierSyntax)
-        assert isinstance(arguments, listof(Syntax))
-
-        super(FunctionCallSyntax, self).__init__(mark)
-        self.base = base
-        self.identifier = identifier
-        self.arguments = arguments
-
-    def __str__(self):
-        # Generate an expression of the form:
-        #   base.identifier(arguments)
-        chunks = []
-        if self.base is not None:
-            chunks.append(str(self.base))
-            chunks.append('.')
-        chunks.append(str(self.identifier))
-        chunks.append('(%s)' % ','.join(str(argument)
-                                        for argument in self.arguments))
-        return ''.join(chunks)
-
-
-class GroupSyntax(Syntax):
-    """
-    Represents an expression in parentheses.
-
-    `expression` (:class:`Syntax`)
-        The expression.
-    """
-    # We keep the parentheses in the syntax tree to ease the reverse
-    # translation from the syntax tree to an HTSQL query.
-
-    def __init__(self, expression, mark):
-        assert isinstance(expression, Syntax)
-
-        super(GroupSyntax, self).__init__(mark)
-        self.expression = expression
-
-    def __str__(self):
-        return '(%s)' % self.expression
-
-
-class SpecifierSyntax(Syntax):
-    """
-    Represents a specifier expression.
-
-    A specifier expression has one of the forms::
-
-        base.identifier
-        base.*
-
-    `base` (:class:`Syntax`)
-        The specifier base.
-
-    `identifier` (:class:`IdentifierSyntax` or :class:`WildcardSyntax`)
-        The specifier identifier.
-    """
-
-    def __init__(self, base, identifier, mark):
-        assert isinstance(base, Syntax)
-        assert isinstance(identifier, (IdentifierSyntax, WildcardSyntax))
-
-        super(SpecifierSyntax, self).__init__(mark)
-        self.base = base
-        self.identifier = identifier
-
-    def __str__(self):
-        return '%s.%s' % (self.base, self.identifier)
-
-
-class IdentifierSyntax(Syntax):
-    """
-    Represents an identifier.
-
-    `value` (a string)
-        The identifier.
-    """
-
-    def __init__(self, value, mark):
-        assert isinstance(value, str)
-        super(IdentifierSyntax, self).__init__(mark)
-
-        self.value = value
-
-    def __str__(self):
-        return self.value
-
-
-class WildcardSyntax(Syntax):
-    """
-    Represents a wildcard.
-    """
-
-    def __str__(self):
-        return '*'
-
-
-class LiteralSyntax(Syntax):
-    """
-    Represents a literal expression.
-
-    This is an abstract class with subclasses :class:`StringSyntax` and
-    :class:`NumberSyntax`.
-
-    `value` (a string)
-        The value.
-    """
-
-    def __init__(self, value, mark):
-        assert isinstance(value, str)
-        super(LiteralSyntax, self).__init__(mark)
-        self.value = value
-
-
-class StringSyntax(LiteralSyntax):
-    """
-    Represents a string literal.
-    """
-
-    def __str__(self):
-        # Quote and %-encode the value.
-        value = '\'%s\'' % self.value.replace('\'', '\'\'')
-        value = self.escape_regexp.sub(self.escape_replace, value)
-        return value
-
-
-class NumberSyntax(LiteralSyntax):
-    """
-    Represents a number literal.
-    """
-
-    def __str__(self):
-        return self.value
-
-

File src/htsql/token.py

-#
-# Copyright (c) 2006-2010, Prometheus Research, LLC
-# Authors: Clark C. Evans <cce@clarkevans.com>,
-#          Kirill Simonov <xi@resolvent.net>
-#
-
-
-"""
-This module defines token types used by the HTSQL scanner.
-"""
-
-
-from mark import Mark
-from util import maybe
-
-
-class Token(object):
-    """
-    Represents a lexical token.
-
-    This is an abstract class.  To add a concrete token type, create a
-    subclass of :class:`Token` and override the following class attributes:
-
-    `name` (a string)
-        The name of the token category.  Must be a valid identifier since
-        it is used by the scanner as the name of the pattern group.
-
-    `pattern` (a string)
-        The regular expression to match the token (in the verbose format).
-
-    `is_ws` (Boolean)
-        If set, indicates that the token is to be discarded.
-
-    `is_end` (Boolean)
-        If set, forces the scanner to stop the processing.
-
-    When adding a subclass of :class:`Token`, you may also want to override
-    methods :meth:`unquote` and :meth:`quote`.
-
-    The constructor of :class:`Token` accepts the following parameters:
-
-    `value` (a string)
-        The token value.
-
-    `mark` (:class:`htsql.mark.Mark`)
-        The location of the token in the original query.
-    """
-
-    name = None
-    pattern = None
-    is_ws = False
-    is_end = False
-
-    @classmethod
-    def unquote(cls, value):
-        """
-        Converts a raw string that matches the token pattern to a token value.
-        """
-        return value
-
-    @classmethod
-    def quote(cls, value):
-        """
-        Reverses :meth:`unquote`.
-        """
-        return value
-
-    def __init__(self, value, mark):
-        assert isinstance(value, str)
-        assert isinstance(mark, Mark)
-
-        self.value = value
-        self.mark = mark
-
-    def __str__(self):
-        return self.value
-
-    def __repr__(self):
-        return "<%s %r>" % (self.__class__.__name__, str(self))
-
-
-class SpaceToken(Token):
-    """
-    Represents a whitespace token.
-
-    In HTSQL, whitespace characters are space, tab, LF, CR, FF, VT and
-    those Unicode characters that are classified as space.
-
-    Whitespace tokens are discarded by the scanner without passing them
-    to the parser.
-    """
-
-    name = 'whitespace'
-    pattern = r""" \s+ """
-    # Do not pass the token to the scanner.
-    is_ws = True
-
-
-class NameToken(Token):
-    """
-    Represents a name token.
-
-    In HTSQL, a name is a sequence of alphanumeric characters
-    that does not start with a digit.  Alphanumeric characters include
-    characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
-    Unicode characters that are classified as alphanumeric.
-    """
-
-    name = 'name'
-    pattern = r""" (?! \d) \w+ """
-
-
-class StringToken(Token):
-    """
-    Represents a string literal token.
-
-    In HTSQL, a string literal is a sequence of arbitrary characters
-    enclosed in single quotes (``'``).  Use a pair of single quote
-    characters (``''``) to represent a single quote in a string.
-    """
-
-    name = 'string'
-    pattern = r""" ' (?: [^'\0] | '')* ' """
-
-    @classmethod
-    def unquote(cls, value):
-        # Strip leading and trailing quotes and replace `''` with `'`.
-        return value[1:-1].replace('\'\'', '\'')
-
-    @classmethod
-    def quote(cls, value):
-        # Replace all occurences of `'` with `''`, enclose the string
-        # in the quotes.
-        return '\'%s\'' % value.replace('\'', '\'\'')
-
-
-class NumberToken(Token):
-    """
-    Represents a number literal token.
-
-    HTSQL supports number literals in integer, float and scientific notations.
-    """
-
-    name = 'number'
-    pattern = r""" (?: \d* \.)? \d+ [eE] [+-]? \d+ | \d* \. \d+ | \d+ \.? """
-
-
-class SymbolToken(Token):
-    """
-    Represents a symbol token.
-
-    HTSQL employs the following groups of symbols:
-
-    *comparison operators*
-        ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
-        ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
-        ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
-        ``=~~``, ``!=~~``. ``<``, ``<=``, ``>``, ``>=``.
-
-    *logical operators*
-        ``!``, ``&``, ``|``, ``->``, ``?``.
-
-    *arithmetic operators*
-        ``+``, ``-``, ``*``, ``^``.
-
-    *punctuation*
-        ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
-        ``.``, ``,``, ``/``.
-    """
-
-    name = 'symbol'
-    pattern = r"""
-        =~~ | =~ | \^~~ | \^~ | \$~~ | \$~ | ~~ | ~ |
-        !=~~ | !=~ | !\^~~ | !\^~ | !\$~~ | !\$~ | !~~ | !~ |
-        <= | < | >= | > | == | = | !== | != | ! |
-        & | \| | -> | \. | , | \? | \^ | / | \* | \+ | - |
-        \( | \) | \{ | \} | \[ | \]
-    """
-
-
-class EndToken(Token):
-    """
-    Represents the end token.
-
-    The end token is emitted when the scanner reached the end of the input
-    string and forces the scanner to stop.
-    """
-
-    name = 'end'
-    pattern = r""" $ """
-    is_end = True
-
-

File src/htsql/tr/__init__.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This package implements the HTSQL-to-SQL translator.
+"""
+
+

File src/htsql/tr/parser.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module implements the HTSQL parser.
+
+The parser takes a stream of tokens from the scanner and produces
+a syntax tree.
+
+Here is the grammar of HTSQL::
+
+    stream          ::= query END
+
+    query           ::= '/'
+                      | '/' base format? filter?
+                      | '/' base? selector format? filter?
+    base            ::= identifier | group
+    format          ::= '.' identifier
+    filter          ::= '?' test
+
+    element         ::= test ( '+' | '-' )*
+    test            ::= and_test ( '|' and_test )*
+    and_test        ::= implies_test ( '&' implies_test )*
+    implies_test    ::= unary_test ( '->' unary_test )?
+    unary_test      ::= '!' unary_test | comparison
+
+    comparison      ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
+                                       '$~~' | '$~' | '~~' | '~' |
+                                       '!=~~' | '!=~' | '!^~~' | '!^~' |
+                                       '!$~~' | '!$~' | '!~~' | '!~' |
+                                       '<=' | '<' | '>=' |  '>' |
+                                       '==' | '=' | '!==' | '!=' )
+                                     expression )?
+
+    expression      ::= term ( ( '+' | '-' ) term )*
+    term            ::= factor ( ( '*' | identifier ) factor )*
+    factor          ::= ( '+' | '-' ) factor | power
+    power           ::= sieve ( '^' power )?
+
+    sieve           ::= specifier selector? filter?
+    specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+    atom            ::= '*' | selector | group | identifier call? | literal
+
+    group           ::= '(' element ')'
+    call            ::= '(' elements? ')'
+    selector        ::= '{' elements? '}'
+    elements        ::= element ( ',' element )* ','?
+
+    identifier      ::= NAME
+    literal         ::= STRING | NUMBER
+
+Note that this grammar is almost LL(1); one notable exception is
+the postfix ``+`` and ``-`` operators.
+"""
+
+
+from ..mark import Mark
+from .scanner import Scanner
+from .token import NameToken, StringToken, NumberToken, SymbolToken, EndToken
+from .syntax import (QuerySyntax, SegmentSyntax, FormatSyntax, SelectorSyntax,
+                     SieveSyntax, OperatorSyntax, FunctionOperatorSyntax,
+                     FunctionCallSyntax, GroupSyntax, SpecifierSyntax,
+                     IdentifierSyntax, WildcardSyntax,
+                     StringSyntax, NumberSyntax)
+
+
class Parser(object):
    """
    Implements an HTSQL parser.

    This is an abstract class; see subclasses of :class:`Parser` for
    implementations of various parts of the HTSQL grammar.

    `input` (a string)
        The input HTSQL expression.
    """

    class __metaclass__(type):
        # Syntactic sugar allowing
        #   SomeParser << tokens
        # as a shortcut for
        #   SomeParser.process(tokens)

        def __lshift__(parser_class, tokens):
            return parser_class.process(tokens)

    def __init__(self, input):
        assert isinstance(input, str)
        self.input = input

    def parse(self):
        """
        Parses the input expression; returns the corresponding syntax node.
        """
        # Convert the input string into a stream of tokens.
        tokens = Scanner(self.input).scan()
        # Let the concrete parser build the syntax tree.
        syntax = self.process(tokens)
        # The whole input must have been consumed: demand the end token.
        tokens.pop(EndToken)
        return syntax

    @classmethod
    def process(cls, tokens):
        """
        Takes a stream of tokens; returns a syntax node.

        This function does not have to exhaust the token stream.

        `tokens` (:class:`htsql.scanner.TokenStream`)
            The stream of tokens to parse.
        """
        # Override in subclasses.
        raise NotImplementedError()
+
+
class QueryParser(Parser):
    """
    Parses an HTSQL query.
    """

    @classmethod
    def process(cls, tokens):
        # Productions handled here:
        #   query           ::= '/'
        #                     | '/' base format? filter?
        #                     | '/' base? selector format? filter?
        #   base            ::= identifier | group
        #   format          ::= '.' identifier
        #   filter          ::= '?' test
        head_token = tokens.pop(SymbolToken, ['/'])
        base = None
        selector = None
        filter = None
        segment = None
        format = None
        # The base is either a bare identifier or a parenthesized group.
        if tokens.peek(NameToken):
            base = IdentifierParser << tokens
        elif tokens.peek(SymbolToken, ['(']):
            base = GroupParser << tokens
        # An optional `{...}` selector.
        if tokens.peek(SymbolToken, ['{']):
            selector = SelectorParser << tokens
        # The format and filter clauses are recognized only when the query
        # has a non-empty segment.
        if not (base is None and selector is None):
            if tokens.peek(SymbolToken, ['.'], do_pop=True):
                identifier = IdentifierParser << tokens
                format = FormatSyntax(identifier, identifier.mark)
            if tokens.peek(SymbolToken, ['?'], do_pop=True):
                filter = TestParser << tokens
            segment = SegmentSyntax(base, selector, filter,
                                    Mark.union(base, selector, filter))
        return QuerySyntax(segment, format,
                           Mark.union(head_token, segment, format))
+
+
class ElementParser(Parser):
    """
    Parses an `element` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   element         ::= test ( '+' | '-' )*
        element = TestParser << tokens
        # Fold any trailing run of `+`/`-` into postfix operator nodes.
        while tokens.peek(SymbolToken, ['+', '-']):
            suffix_token = tokens.pop(SymbolToken, ['+', '-'])
            element = OperatorSyntax(suffix_token.value, element, None,
                                     Mark.union(element, suffix_token))
        return element
+
+
class TestParser(Parser):
    """
    Parses a `test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   test            ::= and_test ( '|' and_test )*
        # A left-associative fold over `|`.
        test = AndTestParser << tokens
        while tokens.peek(SymbolToken, ['|']):
            bar_token = tokens.pop(SymbolToken, ['|'])
            operand = AndTestParser << tokens
            test = OperatorSyntax(bar_token.value, test, operand,
                                  Mark.union(test, operand))
        return test
+
+
class AndTestParser(Parser):
    """
    Parses an `and_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   and_test        ::= implies_test ( '&' implies_test )*
        test = ImpliesTestParser << tokens
        while tokens.peek(SymbolToken, ['&']):
            symbol_token = tokens.pop(SymbolToken, ['&'])
            symbol = symbol_token.value
            left = test
            # BUG FIX: the right operand was parsed with `ImpliesParser`,
            # a name that is not defined anywhere; the grammar requires
            # another `implies_test`, matching the left operand above.
            right = ImpliesTestParser << tokens
            mark = Mark.union(left, right)
            test = OperatorSyntax(symbol, left, right, mark)
        return test
+
+
class ImpliesTestParser(Parser):
    """
    Parses an `implies_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   implies_test    ::= unary_test ( '->' unary_test )?
        left = UnaryTestParser << tokens
        if tokens.peek(SymbolToken, ['->']):
            arrow_token = tokens.pop(SymbolToken, ['->'])
            right = UnaryTestParser << tokens
            return OperatorSyntax(arrow_token.value, left, right,
                                  Mark.union(left, right))
        # No `->` follows: the test is just the unary test itself.
        return left
+
+
class UnaryTestParser(Parser):
    """
    Parses a `unary_test` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   unary_test      ::= '!' unary_test | comparison
        # Collect the (possibly empty) run of leading `!` tokens, parse
        # the trailing comparison, then wrap it right-to-left so the
        # innermost `!` binds tightest.
        bang_tokens = []
        while tokens.peek(SymbolToken, ['!']):
            bang_tokens.append(tokens.pop(SymbolToken, ['!']))
        test = ComparisonParser << tokens
        for bang_token in reversed(bang_tokens):
            test = OperatorSyntax(bang_token.value, None, test,
                                  Mark.union(bang_token, test))
        return test
+
+
class ComparisonParser(Parser):
    """
    Parses a `comparison` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   comparison  ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
        #                                  '$~~' | '$~' | '~~' | '~' |
        #                                  '!=~~' | '!=~' | '!^~~' | '!^~' |
        #                                  '!$~~' | '!$~' | '!~~' | '!~' |
        #                                  '<=' | '<' | '>=' |  '>' |
        #                                  '==' | '=' | '!==' | '!=' )
        #                                expression )?
        symbols = ['=~~', '=~', '^~~', '^~',
                   '$~~', '$~', '~~', '~',
                   '!=~~', '!=~', '!^~~', '!^~',
                   '!$~~', '!$~', '!~~', '!~',
                   '<=', '<', '>=', '>',
                   '==', '=', '!==', '!=']
        left = ExpressionParser << tokens
        # A comparison operator is optional; pop it if present.
        op_token = tokens.peek(SymbolToken, symbols, do_pop=True)
        if op_token is None:
            return left
        right = ExpressionParser << tokens
        return OperatorSyntax(op_token.value, left, right,
                              Mark.union(left, right))
+
+
class ExpressionParser(Parser):
    """
    Parses an `expression` production.
    """

    @classmethod
    def process(cls, tokens):
        # Production:
        #   expression      ::= term ( ( '+' | '-' ) term )*
        expression = TermParser << tokens
        while tokens.peek(SymbolToken, ['+', '-']):
            # `+` and `-` also appear as postfix operators in
            #   element         ::= test ( '+' | '-' )*
            # so look past the whole run of signs to disambiguate.
            # The FOLLOWS set of `element` is { ',', ')', '}' }, and none
            # of these symbols may start a `term`; if one of them follows
            # the signs, they belong to `element` and we stop here.
            ahead = 1
            while tokens.peek(SymbolToken, ['+', '-'], ahead=ahead):
                ahead += 1
            if tokens.peek(SymbolToken, [',', ')', '}'], ahead=ahead):
                break
            sign_token = tokens.pop(SymbolToken, ['+', '-'])
            term = TermParser << tokens
            expression = OperatorSyntax(sign_token.value, expression, term,
                                        Mark.union(expression, term))
        return expression
+
+
class TermParser(Parser):
    """
    Parses a `term` production.
    """

    @classmethod
    def process(cls, tokens):
        # Parses the production:
        #   term            ::= factor ( ( '*' | identifier ) factor )*
        expression = FactorParser << tokens
        while (tokens.peek(SymbolToken, ['*'])
               or tokens.peek(NameToken)):
            if tokens.peek(SymbolToken, ['*']):
                # BUG FIX: the original popped from an undefined name
                # `token` and read `token.value`; pop from the token
                # stream and read the popped token instead.
                symbol_token = tokens.pop(SymbolToken, ['*'])
                symbol = symbol_token.value
                left = expression
                right = FactorParser << tokens
                mark = Mark.union(left, right)
                expression = OperatorSyntax(symbol, left, right, mark)
            else:
                # An identifier in the operator position denotes a binary
                # function call:  `left identifier right`.
                identifier = IdentifierParser << tokens
                left = expression
                right = FactorParser << tokens
                mark = Mark.union(left, right)
                expression = FunctionOperatorSyntax(identifier,
                                                    left, right, mark)
        return expression
+
+
+class FactorParser(Parser):
+    """
+    Parses a `factor` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   factor          ::= ( '+' | '-' ) factor | power
+        symbol_tokens = []
+        while tokens.peek(SymbolToken, ['+', '-']):
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol_tokens.append(symbol_token)
+        expression = PowerParser << tokens
+        while symbol_tokens:
+            symbol_token = symbol_tokens.pop()
+            symbol = symbol_token.value
+            mark = Mark.union(symbol_token, test)
+            expression = OperatorSyntax(symbol, None, expression, mark)
+        return expression
+
+
+class PowerParser(Parser):
+    """
+    Parses a `power` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   power           ::= sieve ( '^' power )?
+        expression = SieveParser << tokens
+        # Note that there is some grammar ambiguity here: if the sieve
+        # contains a filter part, the sieve parser will greedily consume
+        # any `^` expression.
+        if not tokens.peek(SymbolToken, ['^']):
+            return expression
+        symbol_token = tokens.pop(SymbolToken, ['^'])
+        symbol = symbol_token.value
+        left = expression
+        right = PowerParser << tokens
+        mark = Mark.union(left, right)
+        expression = OperatorSyntax(symbol, None, expression, mark)
+        return expression
+
+
+class SieveParser(Parser):
+    """
+    Parses a `sieve` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   sieve           ::= specifier selector? filter?
+        expression = SpecifierParser << tokens
+        selector = None
+        filter = None
+        if tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+        if tokens.peek(SymbolToken, ['?']):
+            tokens.pop(SymbolToken, ['?'])
+            filter = TestParser << tokens
+        if selector is None and filter is None:
+            return expression
+        mark = Mark.union(expression, selector, filter)
+        expression = SieveSyntax(expression, selector, filter, mark)
+        return expression
+
+
+class SpecifierParser(Parser):
+    """
+    Parses a `specifier` production.
+
+    A specifier is an atom followed by a chain of `.`-separated steps:
+    plain attribute access, a method-style call with arguments, or a
+    terminating `.*` wildcard.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+        #   call            ::= '(' elements? ')'
+        #   elements        ::= element ( ',' element )* ','?
+        expression = AtomParser << tokens
+        while tokens.peek(SymbolToken, ['.']):
+            head_token = tokens.pop(SymbolToken, ['.'])
+            if tokens.peek(SymbolToken, ['*']):
+                # `.*` is only permitted as the last step of the chain,
+                # hence the `break` after consuming it.
+                symbol_token = tokens.pop(SymbolToken, ['*'])
+                wildcard = WildcardSyntax(symbol_token.mark)
+                mark = Mark.union(head_token, wildcard)
+                expression = SpecifierSyntax(expression, wildcard, mark)
+                break
+            else:
+                identifier = IdentifierParser << tokens
+                if tokens.peek(SymbolToken, ['(']):
+                    # A `(` right after the identifier makes this step a
+                    # method-style call: `base.name(arg, ...)`.
+                    tokens.pop(SymbolToken, ['('])
+                    arguments = []
+                    while not tokens.peek(SymbolToken, [')']):
+                        argument = ElementParser << tokens
+                        arguments.append(argument)
+                        # Accept `)` here too so a trailing comma before
+                        # the closing parenthesis is allowed.
+                        if not tokens.peek(SymbolToken, [')']):
+                            tokens.pop(SymbolToken, [',', ')'])
+                    tail_token = tokens.pop(SymbolToken, [')'])
+                    mark = Mark.union(head_token, tail_token)
+                    expression = FunctionCallSyntax(expression, identifier,
+                                                    arguments, mark)
+                else:
+                    # Plain attribute step: `base.name`.
+                    mark = Mark.union(head_token, identifier)
+                    expression = SpecifierSyntax(expression, identifier, mark)
+        return expression
+
+class AtomParser(Parser):
+    """
+    Parses an `atom` production.
+
+    An atom is the smallest self-contained expression: a `*` wildcard,
+    a parenthesized group, a `{...}` selector, an identifier (optionally
+    with a call argument list), or a string/number literal.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   atom        ::= '*' | selector | group | identifier call? | literal
+        #   call        ::= '(' elements? ')'
+        #   elements    ::= element ( ',' element )* ','?
+        #   literal     ::= STRING | NUMBER
+        # Dispatch on the type/value of the next token.
+        if tokens.peek(SymbolToken, ['*']):
+            symbol_token = tokens.pop(SymbolToken, ['*'])
+            wildcard = WildcardSyntax(symbol_token.mark)
+            return wildcard
+        elif tokens.peek(SymbolToken, ['(']):
+            group = GroupParser << tokens
+            return group
+        elif tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+            return selector
+        elif tokens.peek(NameToken):
+            identifier = IdentifierParser << tokens
+            if tokens.peek(SymbolToken, ['(']):
+                # Function call without a base: `name(arg, ...)`.
+                tokens.pop(SymbolToken, ['('])
+                arguments = []
+                while not tokens.peek(SymbolToken, [')']):
+                    argument = ElementParser << tokens
+                    arguments.append(argument)
+                    # Accept `)` here too so a trailing comma is allowed.
+                    if not tokens.peek(SymbolToken, [')']):
+                        tokens.pop(SymbolToken, [',', ')'])
+                tail_token = tokens.pop(SymbolToken, [')'])
+                mark = Mark.union(identifier, tail_token)
+                expression = FunctionCallSyntax(None, identifier,
+                                                arguments, mark)
+                return expression
+            else:
+                return identifier
+        elif tokens.peek(StringToken):
+            token = tokens.pop(StringToken)
+            return StringSyntax(token.value, token.mark)
+        elif tokens.peek(NumberToken):
+            token = tokens.pop(NumberToken)
+            return NumberSyntax(token.value, token.mark)
+        # No alternative matched: pop a token we know is not there so the
+        # stream raises a well-formed syntax error for us.
+        # We expect it to always produce an error message.
+        tokens.pop(NameToken)
+        # Not reachable.
+        assert False
+
+
+class GroupParser(Parser):
+    """
+    Parses a `group` production.
+    """
+
+    @classmethod
+    def process(self, tokens):
+        # Parses the production:
+        #   group           ::= '(' element ')'
+        head_token = tokens.pop(SymbolToken, ['('])
+        expression = ElementParser << tokens
+        tail_token = tokens.pop(SymbolToken, [')'])
+        mark = Mark.union(head_token, tail_token)
+        group = GroupSyntax(expression, mark)
+        return group
+
+
+class SelectorParser(Parser):
+    """
+    Parses a `selector` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   selector        ::= '{' elements? '}'
+        #   elements        ::= element ( ',' element )* ','?
+        head_token = tokens.pop(SymbolToken, ['{'])