Commits

Kirill Simonov committed c45e8a3

Implemented the HTSQL parser.

The following new modules were added:
`htsql.mark` defines a slice of an HTSQL query used for error reporting.
`htsql.error` defines HTSQL exceptions.
`htsql.token` defines tokens produced by the HTSQL scanner.
`htsql.scanner` implements the HTSQL scanner and a token stream.
`htsql.syntax` defines syntax nodes.
`htsql.parser` implements the HTSQL parser.
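
For a rough sense of how the new pieces fit together, a query can be parsed
and echoed back as follows (the query and the table/column names are made up
for illustration)::

    from htsql.parser import QueryParser
    from htsql.error import InvalidSyntaxError

    parser = QueryParser("/school{code,name}?exists(department)")
    try:
        syntax = parser.parse()
    except InvalidSyntaxError, exc:
        print exc
    else:
        print syntax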

  • Parent commits 80c019d


Files changed (10)

 
 .. automodule:: htsql.ctl.routine
 
-.. autoclass:: Argument(attribute, validator, default=ARGUMENT_REQUIRED, is_list=False, hint=None)
+.. autoclass:: Argument(attribute, validator, default=MANDATORY_ARGUMENT, is_list=False, hint=None)
    :members: get_hint, get_signature
 
 .. autoclass:: Routine

File src/htsql/application.py

 #
 
 
+"""
+This module implements an HTSQL application.
+"""
+
+
+from .error import HTTPError
 from .util import DB
+from .parser import QueryParser
+import urllib
 
 
 class Application(object):
+    """
+    Implements an HTSQL application.
+
+    `db`
+        The connection URI.
+    """
 
     def __init__(self, db):
         self.db = DB.parse(db)
 
     def __call__(self, environ, start_response):
+        """
+        Implements the WSGI entry point.
+        """
+        # Parse and echo the query.
         path_info = environ['PATH_INFO']
-        if path_info == '/':
-            start_response("200 OK", [('Content-Type', 'text/plain')])
-            return ["Hello World!\n"]
-        else:
-            start_response("404 Not Found", [('Content-Type', 'text/plain')])
-            return ["File not found: %s\n" % path_info]
+        query_string = environ.get('QUERY_STRING')
+        uri = urllib.quote(path_info)
+        if query_string:
+            uri += '?'+query_string
+        parser = QueryParser(uri)
+        try:
+            syntax = parser.parse()
+        except HTTPError, exc:
+            return exc(environ, start_response)
+        start_response("200 OK", [('Content-Type', 'text/plain')])
+        return [str(syntax), "\n"]
 
 

File src/htsql/error.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module implements HTSQL exceptions.
+"""
+
+
+from .mark import Mark
+
+
+class HTTPError(Exception):
+    """
+    An error associated with an HTSQL query.
+
+    An instance of :class:`HTTPError` contains a pointer to an HTSQL
+    expression that caused the error.  The traceback produced by the exception
+    includes the original HTSQL query and a pointer to the erroneous
+    expression.
+
+    An instance of :class:`HTTPError` could also serve as a simple WSGI
+    application generating an appropriate HTTP error code and displaying
+    the error message.
+
+    This is an abstract exception class.  To implement a concrete exception,
+    add a subclass of :class:`HTTPError` and override the following class
+    attributes:
+
+    `code`
+        The HTTP status line.
+
+    `kind`
+        The description of the error class.
+
+    The constructor of :class:`HTTPError` accepts the following parameters:
+
+    `detail` (a string)
+        The description of the error.
+
+    `mark` (:class:`htsql.mark.Mark`)
+        The slice of an HTSQL query that caused the error.
+    """
+
+    code = None
+    kind = None
+
+    def __init__(self, detail, mark):
+        assert isinstance(detail, str)
+        assert isinstance(mark, Mark)
+
+        self.detail = detail
+        self.mark = mark
+
+    def __call__(self, environ, start_response):
+        """
+        Implements the WSGI entry point.
+        """
+        start_response(self.code,
+                       [('Content-Type', 'text/plain; charset=UTF-8')])
+        return [str(self), "\n"]
+
+    def __str__(self):
+        mark_detail = "    %s\n    %s" % (self.mark.input, self.mark.pointer())
+        return "%s: %s:\n%s" % (self.kind, self.detail, mark_detail)
+
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__, self.detail)
+
+
+#
+# HTTP Errors.
+#
+
+
+class BadRequestError(HTTPError):
+    """
+    Represents ``400 Bad Request``.
+    """
+
+    code = "400 Bad Request"
+    kind = "invalid request"
+
+
+class ForbiddenError(HTTPError):
+    """
+    Represents ``403 Forbidden``.
+    """
+
+    code = "403 Forbidden"
+    kind = "the request is denied for security reasons"
+
+
+class NotFoundError(HTTPError):
+    """
+    Represents ``404 Not Found``.
+    """
+
+    code = "404 Not Found"
+    kind = "resource not found"
+
+
+class ConflictError(HTTPError):
+    """
+    Represents ``409 Conflict``.
+    """
+
+    code = "409 Conflict"
+    kind = "conflict"
+
+
+class InternalServerError(HTTPError):
+    """
+    Represents ``500 Internal Server Error``.
+    """
+
+    code = "500 Internal Server Error"
+    kind = "implementation error"
+
+
+class NotImplementedError(HTTPError):
+    """
+    Represents ``501 Not Implemented``.
+    """
+
+    code = "501 Not Implemented"
+    kind = "not implemented"
+
+
+#
+# Concrete HTSQL errors.
+#
+
+
+class InvalidSyntaxError(BadRequestError):
+    """
+    Represents an invalid syntax error.
+
+    This exception is raised by the scanner when it cannot tokenize the query,
+    or by the parser when it finds an unexpected token.
+    """
+
+    kind = "invalid syntax"
+
+

File src/htsql/mark.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module implements a :class:`Mark` object.
+"""
+
+
+class Mark(object):
+    """
+    A slice of an HTSQL query.
+
+    In the process of translation of an HTSQL query to SQL, intermediary nodes
+    of different kinds are generated.  Most of the nodes have a `mark`
+    attribute that points to the original HTSQL expression from which the
+    node was derived.
+
+    Note that a :class:`Mark` object should only be used for presentational
+    purposes such as error reporting.  There is no guarantee that the slice
+    ``mark.input[mark.start:mark.end]`` would represent a valid HTSQL
+    expression.
+
+    `input` (a string)
+        The HTSQL query.
+
+    `start` (an integer)
+        The beginning of the slice.
+
+    `end` (an integer)
+        The end of the slice.
+    """
+
+    @classmethod
+    def union(cls, *nodes):
+        """
+        Generates a new :class:`Mark` object from a collection of separate
+        marks.  The returned mark will cover all the given marks.
+
+        `nodes` (a list)
+            A list of objects having a `mark` attribute.  The list may
+            also include ``None``.
+        """
+        # Get a list of `Mark` objects; if no marks are given, return `None`.
+        marks = [node.mark for node in nodes if node is not None]
+        if not marks:
+            return None
+        # It might happen that there is more than one query associated with
+        # the given marks.  In this case, there is no meaningful way to
+        # construct a union from all the marks.  We deal with it by selecting
+        # marks associated with one of the queries while ignoring all the
+        # others.
+        input = marks[0].input
+        marks = [mark for mark in marks if mark.input is input]
+        # Find the boundaries of the slice and instantiate the mark.
+        start = min(mark.start for mark in marks)
+        end = max(mark.end for mark in marks)
+        return cls(input, start, end)
+
+    def __init__(self, input, start, end):
+        # Sanity check on the arguments.
+        assert isinstance(input, str)
+        assert isinstance(start, int)
+        assert isinstance(end, int)
+        assert 0 <= start <= end <= len(input)
+
+        self.input = input
+        self.start = start
+        self.end = end
+
+    def pointer(self):
+        """
+        Returns a string consisting of space and ``^`` characters that
+        could be used for underlining the mark in the original query.
+
+        For instance, assume the query is::
+
+            /{2+2}
+
+        and the mark points to the expression ``2+2``.  In this case,
+        the method will produce the string::
+
+            '  ^^^'
+
+        If we print the query and the pointer on two subsequent lines,
+        we get::
+
+            /{2+2}
+              ^^^
+        """
+        # For the pointer to be displayed properly, the lengths of
+        # the indent and of the underline should be measured in Unicode
+        # characters.
+        try:
+            indent_length = len(self.input[:self.start].decode('utf-8'))
+            line_length = len(self.input[self.start:self.end].decode('utf-8'))
+        except UnicodeDecodeError:
+            # It might happen that the query is not UTF-8 encoded, in
+            # which case we could only approximate.
+            indent_length = self.start
+            line_length = self.end-self.start
+        pointer = ' '*indent_length + '^'*max(line_length, 1)
+        return pointer
+
+    def __str__(self):
+        return "%r >>> %r <<< %r" % (self.input[:self.start],
+                                     self.input[self.start:self.end],
+                                     self.input[self.end:])
+
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__, self)
+
+

File src/htsql/parser.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module implements the HTSQL parser.
+
+The parser takes a stream of tokens from the scanner and produces
+a syntax tree.
+
+Here is the grammar of HTSQL::
+
+    stream          ::= query END
+
+    query           ::= '/'
+                      | '/' base format? filter?
+                      | '/' base? selector format? filter?
+    base            ::= identifier | group
+    format          ::= '.' identifier
+    filter          ::= '?' test
+
+    element         ::= test ( '+' | '-' )*
+    test            ::= and_test ( '|' and_test )*
+    and_test        ::= implies_test ( '&' implies_test )*
+    implies_test    ::= unary_test ( '->' unary_test )?
+    unary_test      ::= '!' unary_test | comparison
+
+    comparison      ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
+                                       '$~~' | '$~' | '~~' | '~' |
+                                       '!=~~' | '!=~' | '!^~~' | '!^~' |
+                                       '!$~~' | '!$~' | '!~~' | '!~' |
+                                       '<=' | '<' | '>=' |  '>' |
+                                       '==' | '=' | '!==' | '!=' )
+                                     expression )?
+
+    expression      ::= term ( ( '+' | '-' ) term )*
+    term            ::= factor ( ( '*' | identifier ) factor )*
+    factor          ::= ( '+' | '-' ) factor | power
+    power           ::= sieve ( '^' power )?
+
+    sieve           ::= specifier selector? filter?
+    specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+    atom            ::= '*' | selector | group | identifier call? | literal
+
+    group           ::= '(' element ')'
+    call            ::= '(' elements? ')'
+    selector        ::= '{' elements? '}'
+    elements        ::= element ( ',' element )* ','?
+
+    identifier      ::= NAME
+    literal         ::= STRING | NUMBER
+
+Note that this grammar is almost LL(1); the notable exceptions are
+the postfix ``+`` and ``-`` operators.
+"""
+
+
+from .mark import Mark
+from .scanner import Scanner
+from .token import NameToken, StringToken, NumberToken, SymbolToken, EndToken
+from .syntax import (QuerySyntax, SegmentSyntax, FormatSyntax, SelectorSyntax,
+                     SieveSyntax, OperatorSyntax, FunctionOperatorSyntax,
+                     FunctionCallSyntax, GroupSyntax, SpecifierSyntax,
+                     IdentifierSyntax, WildcardSyntax,
+                     StringSyntax, NumberSyntax)
+
+
+class Parser(object):
+    """
+    Implements an HTSQL parser.
+
+    This is an abstract class; see subclasses of :class:`Parser` for
+    implementations of various parts of the HTSQL grammar.
+
+    `input` (a string)
+        The input HTSQL expression.
+    """
+
+    class __metaclass__(type):
+        # Implements a shortcut:
+        #   Parser << tokens
+        # to call
+        #   Parser.process(tokens)
+
+        def __lshift__(parser_class, tokens):
+            return parser_class.process(tokens)
+
+    def __init__(self, input):
+        assert isinstance(input, str)
+        self.input = input
+
+    def parse(self):
+        """
+        Parses the input expression; returns the corresponding syntax node.
+        """
+        # Tokenize the input query.
+        scanner = Scanner(self.input)
+        tokens = scanner.scan()
+        # Parse the input query.
+        syntax = self.process(tokens)
+        # Ensure that we reached the end of the token stream.
+        tokens.pop(EndToken)
+        return syntax
+
+    @classmethod
+    def process(cls, tokens):
+        """
+        Takes a stream of tokens; returns a syntax node.
+
+        This function does not have to exhaust the token stream.
+
+        `tokens` (:class:`htsql.scanner.TokenStream`)
+            The stream of tokens to parse.
+        """
+        # Override in subclasses.
+        raise NotImplementedError()
+
+
+class QueryParser(Parser):
+    """
+    Parses an HTSQL query.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parse the productions:
+        #   query           ::= '/'
+        #                     | '/' base format? filter?
+        #                     | '/' base? selector format? filter?
+        #   base            ::= identifier | group
+        #   format          ::= '.' identifier
+        #   filter          ::= '?' test
+        head_token = tokens.pop(SymbolToken, ['/'])
+        base = None
+        selector = None
+        filter = None
+        segment = None
+        format = None
+        if tokens.peek(NameToken):
+            base = IdentifierParser << tokens
+        elif tokens.peek(SymbolToken, ['(']):
+            base = GroupParser << tokens
+        if tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+        if base is not None or selector is not None:
+            if tokens.peek(SymbolToken, ['.'], do_pop=True):
+                identifier = IdentifierParser << tokens
+                format = FormatSyntax(identifier, identifier.mark)
+            if tokens.peek(SymbolToken, ['?'], do_pop=True):
+                filter = TestParser << tokens
+            mark = Mark.union(base, selector, filter)
+            segment = SegmentSyntax(base, selector, filter, mark)
+        mark = Mark.union(head_token, segment, format)
+        query = QuerySyntax(segment, format, mark)
+        return query
+
+
+class ElementParser(Parser):
+    """
+    Parses an `element` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   element         ::= test ( '+' | '-' )*
+        element = TestParser << tokens
+        while tokens.peek(SymbolToken, ['+', '-']):
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol = symbol_token.value
+            mark = Mark.union(element, symbol_token)
+            element = OperatorSyntax(symbol, element, None, mark)
+        return element
+
+
+class TestParser(Parser):
+    """
+    Parses a `test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   test            ::= and_test ( '|' and_test )*
+        test = AndTestParser << tokens
+        while tokens.peek(SymbolToken, ['|']):
+            symbol_token = tokens.pop(SymbolToken, ['|'])
+            symbol = symbol_token.value
+            left = test
+            right = AndTestParser << tokens
+            mark = Mark.union(left, right)
+            test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class AndTestParser(Parser):
+    """
+    Parses an `and_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   and_test        ::= implies_test ( '&' implies_test )*
+        test = ImpliesTestParser << tokens
+        while tokens.peek(SymbolToken, ['&']):
+            symbol_token = tokens.pop(SymbolToken, ['&'])
+            symbol = symbol_token.value
+            left = test
+            right = ImpliesTestParser << tokens
+            mark = Mark.union(left, right)
+            test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class ImpliesTestParser(Parser):
+    """
+    Parses an `implies_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   implies_test    ::= unary_test ( '->' unary_test )?
+        test = UnaryTestParser << tokens
+        if not tokens.peek(SymbolToken, ['->']):
+            return test
+        symbol_token = tokens.pop(SymbolToken, ['->'])
+        symbol = symbol_token.value
+        left = test
+        right = UnaryTestParser << tokens
+        mark = Mark.union(left, right)
+        test = OperatorSyntax(symbol, left, right, mark)
+        return test
+
+
+class UnaryTestParser(Parser):
+    """
+    Parses a `unary_test` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   unary_test      ::= '!' unary_test | comparison
+        symbol_tokens = []
+        while tokens.peek(SymbolToken, ['!']):
+            symbol_token = tokens.pop(SymbolToken, ['!'])
+            symbol_tokens.append(symbol_token)
+        test = ComparisonParser << tokens
+        while symbol_tokens:
+            symbol_token = symbol_tokens.pop()
+            symbol = symbol_token.value
+            mark = Mark.union(symbol_token, test)
+            test = OperatorSyntax(symbol, None, test, mark)
+        return test
+
+
+class ComparisonParser(Parser):
+    """
+    Parses a `comparison` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   comparison  ::= expression ( ( '=~~' | '=~' | '^~~' | '^~' |
+        #                                  '$~~' | '$~' | '~~' | '~' |
+        #                                  '!=~~' | '!=~' | '!^~~' | '!^~' |
+        #                                  '!$~~' | '!$~' | '!~~' | '!~' |
+        #                                  '<=' | '<' | '>=' |  '>' |
+        #                                  '==' | '=' | '!==' | '!=' )
+        #                                expression )?
+        expression = ExpressionParser << tokens
+        symbol_token = tokens.peek(SymbolToken,
+                                   ['=~~', '=~', '^~~', '^~',
+                                    '$~~', '$~', '~~', '~',
+                                    '!=~~', '!=~', '!^~~', '!^~',
+                                    '!$~~', '!$~', '!~~', '!~',
+                                    '<=', '<', '>=', '>',
+                                    '==', '=', '!==', '!='], do_pop=True)
+        if symbol_token is None:
+            return expression
+        symbol = symbol_token.value
+        left = expression
+        right = ExpressionParser << tokens
+        mark = Mark.union(left, right)
+        comparison = OperatorSyntax(symbol, left, right, mark)
+        return comparison
+
+
+class ExpressionParser(Parser):
+    """
+    Parses an `expression` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   expression      ::= term ( ( '+' | '-' ) term )*
+        expression = TermParser << tokens
+        # Here we perform a look-ahead to distinguish between productions:
+        #   element         ::= test ( '+' | '-' )*
+        # and
+        #   expression      ::= term ( ( '+' | '-' ) term )*
+        # We know that the FOLLOW set of `element` consists of the symbols:
+        #   ',', ')', and '}',
+        # which never start the `term` non-terminal.
+        while tokens.peek(SymbolToken, ['+', '-']):
+            ahead = 1
+            while tokens.peek(SymbolToken, ['+', '-'], ahead=ahead):
+                ahead += 1
+            if tokens.peek(SymbolToken, [',', ')', '}'], ahead=ahead):
+                break
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol = symbol_token.value
+            left = expression
+            right = TermParser << tokens
+            mark = Mark.union(left, right)
+            expression = OperatorSyntax(symbol, left, right, mark)
+        return expression
+
+
+class TermParser(Parser):
+    """
+    Parses a `term` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   term            ::= factor ( ( '*' | identifier ) factor )*
+        expression = FactorParser << tokens
+        while (tokens.peek(SymbolToken, ['*'])
+               or tokens.peek(NameToken)):
+            if tokens.peek(SymbolToken, ['*']):
+                symbol_token = tokens.pop(SymbolToken, ['*'])
+                symbol = symbol_token.value
+                left = expression
+                right = FactorParser << tokens
+                mark = Mark.union(left, right)
+                expression = OperatorSyntax(symbol, left, right, mark)
+            else:
+                identifier = IdentifierParser << tokens
+                left = expression
+                right = FactorParser << tokens
+                mark = Mark.union(left, right)
+                expression = FunctionOperatorSyntax(identifier,
+                                                    left, right, mark)
+        return expression
+
+
+class FactorParser(Parser):
+    """
+    Parses a `factor` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   factor          ::= ( '+' | '-' ) factor | power
+        symbol_tokens = []
+        while tokens.peek(SymbolToken, ['+', '-']):
+            symbol_token = tokens.pop(SymbolToken, ['+', '-'])
+            symbol_tokens.append(symbol_token)
+        expression = PowerParser << tokens
+        while symbol_tokens:
+            symbol_token = symbol_tokens.pop()
+            symbol = symbol_token.value
+            mark = Mark.union(symbol_token, expression)
+            expression = OperatorSyntax(symbol, None, expression, mark)
+        return expression
+
+
+class PowerParser(Parser):
+    """
+    Parses a `power` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   power           ::= sieve ( '^' power )?
+        expression = SieveParser << tokens
+        # Note that there is some grammar ambiguity here: if the sieve
+        # contains a filter part, the sieve parser will greedily consume
+        # any `^` expression.
+        if not tokens.peek(SymbolToken, ['^']):
+            return expression
+        symbol_token = tokens.pop(SymbolToken, ['^'])
+        symbol = symbol_token.value
+        left = expression
+        right = PowerParser << tokens
+        mark = Mark.union(left, right)
+        expression = OperatorSyntax(symbol, left, right, mark)
+        return expression
+
+
+class SieveParser(Parser):
+    """
+    Parses a `sieve` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   sieve           ::= specifier selector? filter?
+        expression = SpecifierParser << tokens
+        selector = None
+        filter = None
+        if tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+        if tokens.peek(SymbolToken, ['?']):
+            tokens.pop(SymbolToken, ['?'])
+            filter = TestParser << tokens
+        if selector is None and filter is None:
+            return expression
+        mark = Mark.union(expression, selector, filter)
+        expression = SieveSyntax(expression, selector, filter, mark)
+        return expression
+
+
+class SpecifierParser(Parser):
+    """
+    Parses a `specifier` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   specifier       ::= atom ( '.' identifier call? )* ( '.' '*' )?
+        #   call            ::= '(' elements? ')'
+        #   elements        ::= element ( ',' element )* ','?
+        expression = AtomParser << tokens
+        while tokens.peek(SymbolToken, ['.']):
+            head_token = tokens.pop(SymbolToken, ['.'])
+            if tokens.peek(SymbolToken, ['*']):
+                symbol_token = tokens.pop(SymbolToken, ['*'])
+                wildcard = WildcardSyntax(symbol_token.mark)
+                mark = Mark.union(head_token, wildcard)
+                expression = SpecifierSyntax(expression, wildcard, mark)
+                break
+            else:
+                identifier = IdentifierParser << tokens
+                if tokens.peek(SymbolToken, ['(']):
+                    tokens.pop(SymbolToken, ['('])
+                    arguments = []
+                    while not tokens.peek(SymbolToken, [')']):
+                        argument = ElementParser << tokens
+                        arguments.append(argument)
+                        if not tokens.peek(SymbolToken, [')']):
+                            tokens.pop(SymbolToken, [',', ')'])
+                    tail_token = tokens.pop(SymbolToken, [')'])
+                    mark = Mark.union(head_token, tail_token)
+                    expression = FunctionCallSyntax(expression, identifier,
+                                                    arguments, mark)
+                else:
+                    mark = Mark.union(head_token, identifier)
+                    expression = SpecifierSyntax(expression, identifier, mark)
+        return expression
+
+
+class AtomParser(Parser):
+    """
+    Parses an `atom` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   atom        ::= '*' | selector | group | identifier call? | literal
+        #   call        ::= '(' elements? ')'
+        #   elements    ::= element ( ',' element )* ','?
+        #   literal     ::= STRING | NUMBER
+        if tokens.peek(SymbolToken, ['*']):
+            symbol_token = tokens.pop(SymbolToken, ['*'])
+            wildcard = WildcardSyntax(symbol_token.mark)
+            return wildcard
+        elif tokens.peek(SymbolToken, ['(']):
+            group = GroupParser << tokens
+            return group
+        elif tokens.peek(SymbolToken, ['{']):
+            selector = SelectorParser << tokens
+            return selector
+        elif tokens.peek(NameToken):
+            identifier = IdentifierParser << tokens
+            if tokens.peek(SymbolToken, ['(']):
+                tokens.pop(SymbolToken, ['('])
+                arguments = []
+                while not tokens.peek(SymbolToken, [')']):
+                    argument = ElementParser << tokens
+                    arguments.append(argument)
+                    if not tokens.peek(SymbolToken, [')']):
+                        tokens.pop(SymbolToken, [',', ')'])
+                tail_token = tokens.pop(SymbolToken, [')'])
+                mark = Mark.union(identifier, tail_token)
+                expression = FunctionCallSyntax(None, identifier,
+                                                arguments, mark)
+                return expression
+            else:
+                return identifier
+        elif tokens.peek(StringToken):
+            token = tokens.pop(StringToken)
+            return StringSyntax(token.value, token.mark)
+        elif tokens.peek(NumberToken):
+            token = tokens.pop(NumberToken)
+            return NumberSyntax(token.value, token.mark)
+        # None of the alternatives matched; this pop() will fail with an error.
+        tokens.pop(NameToken)
+        # Not reachable.
+        assert False
+
+
+class GroupParser(Parser):
+    """
+    Parses a `group` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   group           ::= '(' element ')'
+        head_token = tokens.pop(SymbolToken, ['('])
+        expression = ElementParser << tokens
+        tail_token = tokens.pop(SymbolToken, [')'])
+        mark = Mark.union(head_token, tail_token)
+        group = GroupSyntax(expression, mark)
+        return group
+
+
+class SelectorParser(Parser):
+    """
+    Parses a `selector` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the productions:
+        #   selector        ::= '{' elements? '}'
+        #   elements        ::= element ( ',' element )* ','?
+        head_token = tokens.pop(SymbolToken, ['{'])
+        elements = []
+        while not tokens.peek(SymbolToken, ['}']):
+            element = ElementParser << tokens
+            elements.append(element)
+            if not tokens.peek(SymbolToken, ['}']):
+                # We know it's not going to be '}', but we put it into the list
+                # of accepted values to generate a better error message.
+                tokens.pop(SymbolToken, [',', '}'])
+        tail_token = tokens.pop(SymbolToken, ['}'])
+        mark = Mark.union(head_token, tail_token)
+        selector = SelectorSyntax(elements, mark)
+        return selector
+
+
+class IdentifierParser(Parser):
+    """
+    Parses an `identifier` production.
+    """
+
+    @classmethod
+    def process(cls, tokens):
+        # Parses the production:
+        #   identifier      ::= NAME
+        name_token = tokens.pop(NameToken)
+        identifier = IdentifierSyntax(name_token.value, name_token.mark)
+        return identifier
+
+

File src/htsql/scanner.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module implements the HTSQL scanner.
+
+The scanner tokenizes the input query and produces a stream of tokens.
+
+The first step of scanning is decoding ``%``-escape sequences.  Then
+the scanner splits the input query into a list of tokens.  The following
+token types are emitted:
+
+*NAME*
+    Represents an HTSQL identifier: a sequence of alphanumeric characters
+    that does not start with a digit.  Alphanumeric characters include
+    characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
+    Unicode characters that are classified as alphanumeric.
+
+*NUMBER*
+    Represents a number literal: integer, float and scientific notations
+    are recognized.
+
+*STRING*
+    Represents a string literal enclosed in single quotes; any single
+    quote character should be doubled.
+
+*SYMBOL*
+    Represents a valid symbol in HTSQL grammar; one of the following
+    symbols:
+
+    *comparison operators*
+        ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
+        ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
+        ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
+        ``=~~``, ``!=~~``, ``<``, ``<=``, ``>``, ``>=``.
+
+    *logical operators*
+        ``!``, ``&``, ``|``, ``->``, ``?``.
+
+    *arithmetic operators*
+        ``+``, ``-``, ``*``, ``^``.
+
+    *punctuation*
+        ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
+        ``.``, ``,``, ``/``.
+
+There are also two special token types:
+
+*WHITESPACE*
+    Represents a whitespace character, one of: space, tab, LF, CR, FF, VT,
+    and those Unicode characters that are classified as space.  Whitespace
+    tokens are immediately discarded by the scanner and never emitted.
+
+*END*
+    Indicates the end of the query; it is always the last token emitted.
+"""
+
+
+from .token import (Token, SpaceToken, NameToken, StringToken, NumberToken,
+                    SymbolToken, EndToken)
+from .mark import Mark
+from .error import InvalidSyntaxError
+from .util import maybe, listof
+import re
+
+
+class TokenStream(object):
+    """
+    A sequence of tokens.
+
+    :class:`TokenStream` wraps a list of tokens with a convenient interface
+    for consumption and look-ahead.
+
+    `tokens` (a list of :class:`htsql.token.Token` objects)
+        A list of tokens.
+    """
+
+    def __init__(self, tokens):
+        assert isinstance(tokens, listof(Token))
+
+        # The list of tokens.
+        self.tokens = tokens
+        # The position of the active token.
+        self.idx = 0
+
+    def peek(self, token_class=None, values=None, ahead=0,
+             do_pop=False, do_force=False):
+        """
+        Returns the active token.
+
+        If the parameter `ahead` is non-zero, the method returns the token
+        `ahead` positions past the active one.
+
+        When `token_class` is set, the method checks if the token is an
+        instance of the given class.  When `values` is set, the method
+        checks if the token value belongs to the given list of values.
+        If any of the checks fail, the method either raises
+        :exc:`htsql.error.InvalidSyntaxError` or
+        returns ``None`` depending on the value of the `do_force`
+        parameter.
+
+        If `do_pop` is enabled, the method advances the active pointer
+        past the returned token.
+
+        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
+            If not ``None``, the method checks that the returned token
+            is an instance of `token_class`.
+
+        `values` (a list of strings or ``None``)
+            If not ``None``, the method checks that the value of the
+            returned token belongs to the list.
+
+        `ahead` (an integer)
+            The position of the returned token relative to the active one.
+
+        `do_pop` (Boolean)
+            If set, the method will advance the active pointer.
+
+        `do_force` (Boolean)
+            This flag affects the method behavior when any of the token
+            checks fail.  If set, the method will raise
+            :exc:`htsql.error.InvalidSyntaxError`; otherwise it will return
+            ``None``.
+        """
+        # Sanity check on the arguments.
+        assert token_class is None or issubclass(token_class, Token)
+        assert isinstance(values, maybe(listof(str)))
+        assert token_class is not None or values is None
+        assert isinstance(ahead, int) and ahead >= 0
+        assert self.idx+ahead < len(self.tokens)
+        assert isinstance(do_pop, bool)
+        assert isinstance(do_force, bool)
+
+        # Get the token we are going to return.  It is the responsibility
+        # of the caller to ensure that the index is in the list bounds.
+        token = self.tokens[self.idx+ahead]
+
+        # Indicates if the token passed the given tests.
+        is_expected = True
+        if token_class is not None:
+            # Check the token type.
+            if not isinstance(token, token_class):
+                is_expected = False
+            else:
+                if values is not None:
+                    # Check the token value.
+                    if token.value not in values:
+                        is_expected = False
+
+        # The token failed the checks.
+        if not is_expected:
+            # If `do_force` is not set, return `None`; otherwise generate
+            # an error message of the form:
+            #   expected {token_class.NAME} ('{values[0]}', ...);
+            #   got {token.NAME} '{token.value}'
+            if not do_force:
+                return None
+            expected = "%s" % token_class.name.upper()
+            if values:
+                if len(values) == 1:
+                    expected = "%s %r" % (expected, values[0])
+                else:
+                    expected = "%s (%s)" % (expected,
+                                            ", ".join(repr(value)
+                                                      for value in values))
+            got = "%s %r" % (token.name.upper(), token.value)
+            raise InvalidSyntaxError("expected %s; got %s" % (expected, got),
+                                     token.mark)
+        # Advance the pointer.
+        if do_pop:
+            self.idx += ahead+1
+
+        return token
+
+    def pop(self, token_class=None, values=None):
+        """
+        Returns the active token and advances the pointer to the next token.
+
+        When `token_class` is set, the method checks if the token is an
+        instance of the given class.  When `values` is set, the method
+        checks if the token value belongs to the given list of values.
+        If any of the checks fail, the method raises
+        :exc:`htsql.error.InvalidSyntaxError`.
+
+        `token_class` (a subclass of :class:`htsql.token.Token` or ``None``)
+            If not ``None``, the method checks that the active token
+            is an instance of `token_class`.
+
+        `values` (a list of strings or ``None``)
+            If not ``None``, the method checks that the value of the active
+            token belongs to the list.
+        """
+        return self.peek(token_class, values,
+                         do_pop=True, do_force=True)
+
+
+class Scanner(object):
+    """
+    Implements the HTSQL scanner.
+
+    `input`
+        An HTSQL query.
+    """
+
+    # Token types recognized by the scanner, in matching order.
+    tokens = [
+            SpaceToken,
+            NameToken,
+            NumberToken,
+            StringToken,
+            SymbolToken,
+            EndToken,
+    ]
+    # A regular expression that matches an HTSQL token.  The expression
+    # is compiled from individual token patterns on the first instantiation
+    # of the scanner.
+    regexp = None
+
+    # The regular expression to match %-escape sequences.
+    escape_pattern = r"""%(?P<code>[0-9A-Fa-f]{2})?"""
+    escape_regexp = re.compile(escape_pattern)
+
+    def __init__(self, input):
+        assert isinstance(input, str)
+
+        self.input = input
+        self.init_regexp()
+
+    @classmethod
+    def init_regexp(cls):
+        # Compile the regular expression that matches all token types.
+
+        # If it is already done, return.
+        if cls.regexp is not None:
+            return
+
+        # Get patterns for each token type grouped by the class name.
+        token_patterns = []
+        for token_class in cls.tokens:
+            pattern = r"(?P<%s> %s)" % (token_class.name, token_class.pattern)
+            token_patterns.append(pattern)
+
+        # Combine the individual patterns and compile the expression.
+        pattern = r"|".join(token_patterns)
+        cls.regexp = re.compile(pattern, re.X|re.U)
+
+    def unquote(self, data):
+        # Decode %-escape sequences.
+
+        def replace(match):
+            # Two hexadecimal digits that encode a byte value.
+            code = match.group('code')
+            # Complain if we get `%` not followed by two hexadecimal digits.
+            if not code:
+                mark = Mark(match.string, match.start(), match.end())
+                raise InvalidSyntaxError("invalid escape sequence", mark)
+            # Return the character corresponding to the escape sequence.
+            return chr(int(code, 16))
+
+        # Apply `replace` to all `%`-escape sequences and return the result.
+        data = self.escape_regexp.sub(replace, data)
+        return data
+
+    def scan(self):
+        """
+        Tokenizes the query; returns a :class:`TokenStream` instance.
+
+        In case of syntax errors, raises :exc:`htsql.error.InvalidSyntaxError`.
+        """
+        # Decode %-escape sequences.
+        unquoted_input = self.unquote(self.input)
+        # Since we want the `\w` pattern to match Unicode characters
+        # classified as alphanumeric, we have to convert the input query
+        # to Unicode.
+        try:
+            decoded_input = unquoted_input.decode('utf-8')
+        except UnicodeDecodeError, exc:
+            mark = Mark(unquoted_input, exc.start, exc.end)
+            raise InvalidSyntaxError("cannot decode a UTF-8 character: %s"
+                                     % exc.reason, mark)
+
+        # The beginning of the next token.
+        start = 0
+        # The beginning of the mark slice.  Note that both `start` and
+        # `mark_start` point to the same position in the query string;
+        # however `start` is measured in Unicode characters while
+        # `mark_start` is measured in octets.
+        mark_start = 0
+        # Have we reached the final token?
+        is_end = False
+        # The list of generated tokens.
+        tokens = []
+
+        # Loop till we get the final token.
+        while not is_end:
+            # Match the next token.
+            match = self.regexp.match(decoded_input, start)
+            if match is None:
+                mark = Mark(unquoted_input, mark_start, mark_start+1)
+                raise InvalidSyntaxError("unexpected character %r" %
+                                         decoded_input[start].encode('utf-8'),
+                                         mark)
+
+            # Find the token class that matched the token.
+            for token_class in self.tokens:
+                group = match.group(token_class.name)
+                if group is not None:
+                    break
+            else:
+                # Unreachable.
+                assert False
+
+            # The raw string that matched the token pattern.
+            group = group.encode('utf-8')
+            # Unquote the token value.
+            value = token_class.unquote(group)
+            # The end of the mark slice.
+            mark_end = mark_start+len(group)
+
+            # Skip whitespace tokens; otherwise produce the next token.
+            if not token_class.is_ws:
+                mark = Mark(unquoted_input, mark_start, mark_end)
+                token = token_class(value, mark)
+                tokens.append(token)
+
+            # Check if we reached the final token.
+            if token_class.is_end:
+                is_end = True
+
+            # Advance the pointers to the beginning of the next token.
+            start = match.end()
+            mark_start = mark_end
+
+        return TokenStream(tokens)
+
+

File src/htsql/syntax.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module defines syntax nodes for the HTSQL grammar.
+"""
+
+
+from .mark import Mark
+from .util import maybe, listof
+import re
+
+
+class Syntax(object):
+    """
+    Represents a syntax node.
+
+    `mark` (:class:`htsql.mark.Mark`)
+        The location of the node in the original query.
+    """
+
+    # The pattern to %-escape unsafe characters.
+    escape_pattern = r"[\x00-\x1F%\x7F]"
+    escape_regexp = re.compile(escape_pattern)
+    # `staticmethod` keeps the lambda from being bound as an instance method.
+    escape_replace = staticmethod(lambda m: "%%%02X" % ord(m.group()))
+
+    def __init__(self, mark):
+        assert isinstance(mark, Mark)
+        self.mark = mark
+
+    def __str__(self):
+        """
+        Returns an HTSQL expression that would produce the same syntax node.
+        """
+        # Override when subclassing.
+        raise NotImplementedError()
+
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__, self)
+
+
+class QuerySyntax(Syntax):
+    """
+    Represents an HTSQL query.
+
+    An HTSQL query consists of a segment expression and a format identifier.
+
+    `segment` (:class:`SegmentSyntax` or ``None``)
+        The segment expression.
+
+    `format` (:class:`FormatSyntax` or ``None``)
+        The query format.
+    """
+
+    def __init__(self, segment, format, mark):
+        assert isinstance(segment, maybe(SegmentSyntax))
+        assert isinstance(format, maybe(FormatSyntax))
+
+        super(QuerySyntax, self).__init__(mark)
+        self.segment = segment
+        self.format = format
+
+    def __str__(self):
+        # Generate an HTSQL query corresponding to the node.
+        # Note that since the format identifier could split the segment
+        # expression between the selector and the filter, we have to
+        # serialize parts of the segment expression separately.
+        chunks = []
+        chunks.append('/')
+        if self.segment is not None:
+            if self.segment.base is not None:
+                chunks.append(str(self.segment.base))
+            if self.segment.selector is not None:
+                chunks.append(str(self.segment.selector))
+            if self.format is not None:
+                chunks.append('.')
+                chunks.append(str(self.format))
+            if self.segment.filter is not None:
+                chunks.append('?')
+                chunks.append(str(self.segment.filter))
+        return ''.join(chunks)
+
+
+class SegmentSyntax(Syntax):
+    """
+    Represents a segment expression.
+
+    A segment expression consists of the base expression, the selector,
+    and the filter::
+
+        /base{selector}?filter
+
+    `base` (:class:`Syntax` or ``None``)
+        The base expression.
+
+    `selector` (:class:`SelectorSyntax` or ``None``)
+        The selector.
+
+    `filter` (:class:`Syntax` or ``None``)
+        The filter expression.
+    """
+
+    def __init__(self, base, selector, filter, mark):
+        assert isinstance(base, maybe(Syntax))
+        assert isinstance(selector, maybe(SelectorSyntax))
+        assert isinstance(filter, maybe(Syntax))
+
+        super(SegmentSyntax, self).__init__(mark)
+        self.base = base
+        self.selector = selector
+        self.filter = filter
+
+    def __str__(self):
+        # Generate an HTSQL expression of the form:
+        #   base{selector}?filter
+        chunks = []
+        if self.base is not None:
+            chunks.append(str(self.base))
+        if self.selector is not None:
+            chunks.append(str(self.selector))
+        if self.filter is not None:
+            chunks.append('?')
+            chunks.append(str(self.filter))
+        return ''.join(chunks)
+
+
+class FormatSyntax(Syntax):
+    """
+    Represents the format identifier.
+
+    `identifier`
+        The format identifier.
+    """
+    # Note that essentially `FormatSyntax` is a wrapper over
+    # `IdentifierSyntax`.  We wrap the format identifier into
+    # a separate node to enable adapting by the node type.
+
+    def __init__(self, identifier, mark):
+        assert isinstance(identifier, IdentifierSyntax)
+
+        super(FormatSyntax, self).__init__(mark)
+        self.identifier = identifier
+
+    def __str__(self):
+        return str(self.identifier)
+
+
+class SelectorSyntax(Syntax):
+    """
+    Represents a selector expression.
+
+    A selector is a sequence of elements::
+
+        {element, ...}
+
+    `elements` (a list of :class:`Syntax`)
+        The list of selector elements.
+    """
+
+    def __init__(self, elements, mark):
+        assert isinstance(elements, listof(Syntax))
+
+        super(SelectorSyntax, self).__init__(mark)
+        self.elements = elements
+
+    def __str__(self):
+        # Generate an expression of the form:
+        #   {element,...}
+        return '{%s}' % ','.join(str(element)
+                                 for element in self.elements)
+
+
+class SieveSyntax(Syntax):
+    """
+    Represents a sieve expression.
+
+    A sieve expression has the same shape as the segment expression.
+    It consists of the base, the selector and the filter::
+
+        base{selector}?filter
+
+    `base` (:class:`Syntax`)
+        The sieve base expression.
+
+    `selector` (:class:`SelectorSyntax` or ``None``)
+        The sieve selector.
+
+    `filter` (:class:`Syntax` or ``None``)
+        The sieve filter expression.
+    """
+
+    def __init__(self, base, selector, filter, mark):
+        assert isinstance(base, Syntax)
+        assert isinstance(selector, maybe(SelectorSyntax))
+        assert isinstance(filter, maybe(Syntax))
+        assert selector is not None or filter is not None
+
+        super(SieveSyntax, self).__init__(mark)
+        self.base = base
+        self.selector = selector
+        self.filter = filter
+
+    def __str__(self):
+        # Generate an expression of the form:
+        #   base{selector}?filter
+        chunks = []
+        chunks.append(str(self.base))
+        if self.selector is not None:
+            chunks.append(str(self.selector))
+        if self.filter is not None:
+            chunks.append('?')
+            chunks.append(str(self.filter))
+        return ''.join(chunks)
+
+
+class OperatorSyntax(Syntax):
+    """
+    Represents an operator expression.
+
+    An operator expression has one of the following forms::
+
+        left <symbol> right
+        left <symbol>
+        <symbol> right
+
+    `symbol` (a string)
+        The operator.
+
+    `left` (:class:`Syntax` or ``None``)
+        The left argument.
+
+    `right` (:class:`Syntax` or ``None``)
+        The right argument.
+    """
+
+    def __init__(self, symbol, left, right, mark):
+        assert isinstance(symbol, str)
+        assert isinstance(left, maybe(Syntax))
+        assert isinstance(right, maybe(Syntax))
+        assert left is not None or right is not None
+        super(OperatorSyntax, self).__init__(mark)
+        self.symbol = symbol
+        self.left = left
+        self.right = right
+
+    def __str__(self):
+        # Generate an expression of the form:
+        #   left<symbol>right
+        chunks = []
+        if self.left is not None:
+            chunks.append(str(self.left))
+        chunks.append(self.symbol)
+        if self.right is not None:
+            chunks.append(str(self.right))
+        return ''.join(chunks)
+
+
+class FunctionOperatorSyntax(Syntax):
+    """
+    Represents a binary function call in the operator form.
+
+    This expression has the form::
+
+        left identifier right
+
+    and is equivalent to the expression::
+
+        identifier(left, right)
+
+    `identifier` (:class:`IdentifierSyntax`)
+        The function name.
+
+    `left` (:class:`Syntax`)
+        The first argument.
+
+    `right` (:class:`Syntax`)
+        The second argument.
+    """
+
+    def __init__(self, identifier, left, right, mark):
+        assert isinstance(identifier, IdentifierSyntax)
+        assert isinstance(left, Syntax)
+        assert isinstance(right, Syntax)
+
+        super(FunctionOperatorSyntax, self).__init__(mark)
+        self.identifier = identifier
+        self.left = left
+        self.right = right
+
+    def __str__(self):
+        # Generate an expression of the form:
+        #   left identifier right
+        chunks = []
+        if self.left is not None:
+            chunks.append(str(self.left))
+            chunks.append(' ')
+        chunks.append(str(self.identifier))
+        if self.right is not None:
+            chunks.append(' ')
+            chunks.append(str(self.right))
+        return ''.join(chunks)
+
+
+class FunctionCallSyntax(Syntax):
+    """
+    Represents a function or a method call.
+
+    This expression has one of the forms::
+
+        identifier(arguments)
+        base.identifier(arguments)
+
+    `base` (:class:`Syntax` or ``None``)
+        The method base.
+
+    `identifier` (:class:`IdentifierSyntax`)
+        The function name.
+
+    `arguments` (a list of :class:`Syntax`)
+        The list of arguments.
+    """
+
+    def __init__(self, base, identifier, arguments, mark):
+        assert isinstance(base, maybe(Syntax))
+        assert isinstance(identifier, IdentifierSyntax)
+        assert isinstance(arguments, listof(Syntax))
+
+        super(FunctionCallSyntax, self).__init__(mark)
+        self.base = base
+        self.identifier = identifier
+        self.arguments = arguments
+
+    def __str__(self):
+        # Generate an expression of the form:
+        #   base.identifier(arguments)
+        chunks = []
+        if self.base is not None:
+            chunks.append(str(self.base))
+            chunks.append('.')
+        chunks.append(str(self.identifier))
+        chunks.append('(%s)' % ','.join(str(argument)
+                                        for argument in self.arguments))
+        return ''.join(chunks)
+
+
+class GroupSyntax(Syntax):
+    """
+    Represents an expression in parentheses.
+
+    `expression` (:class:`Syntax`)
+        The expression.
+    """
+    # We keep the parentheses in the syntax tree to ease the reverse
+    # translation from the syntax tree to an HTSQL query.
+
+    def __init__(self, expression, mark):
+        assert isinstance(expression, Syntax)
+
+        super(GroupSyntax, self).__init__(mark)
+        self.expression = expression
+
+    def __str__(self):
+        return '(%s)' % self.expression
+
+
+class SpecifierSyntax(Syntax):
+    """
+    Represents a specifier expression.
+
+    A specifier expression has one of the forms::
+
+        base.identifier
+        base.*
+
+    `base` (:class:`Syntax`)
+        The specifier base.
+
+    `identifier` (:class:`IdentifierSyntax` or :class:`WildcardSyntax`)
+        The specifier identifier.
+    """
+
+    def __init__(self, base, identifier, mark):
+        assert isinstance(base, Syntax)
+        assert isinstance(identifier, (IdentifierSyntax, WildcardSyntax))
+
+        super(SpecifierSyntax, self).__init__(mark)
+        self.base = base
+        self.identifier = identifier
+
+    def __str__(self):
+        return '%s.%s' % (self.base, self.identifier)
+
+
+class IdentifierSyntax(Syntax):
+    """
+    Represents an identifier.
+
+    `value` (a string)
+        The identifier.
+    """
+
+    def __init__(self, value, mark):
+        assert isinstance(value, str)
+        super(IdentifierSyntax, self).__init__(mark)
+
+        self.value = value
+
+    def __str__(self):
+        return self.value
+
+
+class WildcardSyntax(Syntax):
+    """
+    Represents a wildcard.
+    """
+
+    def __str__(self):
+        return '*'
+
+
+class LiteralSyntax(Syntax):
+    """
+    Represents a literal expression.
+
+    This is an abstract class with subclasses :class:`StringSyntax` and
+    :class:`NumberSyntax`.
+
+    `value` (a string)
+        The value.
+    """
+
+    def __init__(self, value, mark):
+        assert isinstance(value, str)
+        super(LiteralSyntax, self).__init__(mark)
+        self.value = value
+
+
+class StringSyntax(LiteralSyntax):
+    """
+    Represents a string literal.
+    """
+
+    def __str__(self):
+        # Quote and %-encode the value.
+        value = '\'%s\'' % self.value.replace('\'', '\'\'')
+        value = self.escape_regexp.sub(self.escape_replace, value)
+        return value
+
+
+class NumberSyntax(LiteralSyntax):
+    """
+    Represents a number literal.
+    """
+
+    def __str__(self):
+        return self.value
+
+

File src/htsql/token.py

+#
+# Copyright (c) 2006-2010, Prometheus Research, LLC
+# Authors: Clark C. Evans <cce@clarkevans.com>,
+#          Kirill Simonov <xi@resolvent.net>
+#
+
+
+"""
+This module defines token types used by the HTSQL scanner.
+"""
+
+
+from .mark import Mark
+from .util import maybe
+
+
+class Token(object):
+    """
+    Represents a lexical token.
+
+    This is an abstract class.  To add a concrete token type, create a
+    subclass of :class:`Token` and override the following class attributes:
+
+    `name` (a string)
+        The name of the token category.  Must be a valid identifier since
+        it is used by the scanner as the name of the pattern group.
+
+    `pattern` (a string)
+        The regular expression to match the token (in the verbose format).
+
+    `is_ws` (Boolean)
+        If set, indicates that the token is to be discarded.
+
+    `is_end` (Boolean)
+        If set, forces the scanner to stop the processing.
+
+    When adding a subclass of :class:`Token`, you may also want to override
+    methods :meth:`unquote` and :meth:`quote`.
+
+    The constructor of :class:`Token` accepts the following parameters:
+
+    `value` (a string)
+        The token value.
+
+    `mark` (:class:`htsql.mark.Mark`)
+        The location of the token in the original query.
+    """
+
+    name = None
+    pattern = None
+    is_ws = False
+    is_end = False
+
+    @classmethod
+    def unquote(cls, value):
+        """
+        Converts a raw string that matches the token pattern to a token value.
+        """
+        return value
+
+    @classmethod
+    def quote(cls, value):
+        """
+        Reverses :meth:`unquote`.
+        """
+        return value
+
+    def __init__(self, value, mark):
+        assert isinstance(value, str)
+        assert isinstance(mark, Mark)
+
+        self.value = value
+        self.mark = mark
+
+    def __str__(self):
+        return self.value
+
+    def __repr__(self):
+        return "<%s %r>" % (self.__class__.__name__, str(self))
+
+
+class SpaceToken(Token):
+    """
+    Represents a whitespace token.
+
+    In HTSQL, whitespace characters are space, tab, LF, CR, FF, VT and
+    those Unicode characters that are classified as space.
+
+    Whitespace tokens are discarded by the scanner without passing them
+    to the parser.
+    """
+
+    name = 'whitespace'
+    pattern = r""" \s+ """
+    # Do not pass the token to the parser.
+    is_ws = True
+
+
+class NameToken(Token):
+    """
+    Represents a name token.
+
+    In HTSQL, a name is a sequence of alphanumeric characters
+    that does not start with a digit.  Alphanumeric characters include
+    characters ``a``-``z``, ``A``-``Z``, ``0``-``9``, ``_``, and those
+    Unicode characters that are classified as alphanumeric.
+    """
+
+    name = 'name'
+    pattern = r""" (?! \d) \w+ """
+
+
+class StringToken(Token):
+    """
+    Represents a string literal token.
+
+    In HTSQL, a string literal is a sequence of arbitrary characters
+    enclosed in single quotes (``'``).  Use a pair of single quote
+    characters (``''``) to represent a single quote in a string.
+    """
+
+    name = 'string'
+    pattern = r""" ' (?: [^'\0] | '')* ' """
+
+    @classmethod
+    def unquote(cls, value):
+        # Strip leading and trailing quotes and replace `''` with `'`.
+        return value[1:-1].replace('\'\'', '\'')
+
+    @classmethod
+    def quote(cls, value):
+        # Replace all occurrences of `'` with `''` and enclose the string
+        # in quotes.
+        return '\'%s\'' % value.replace('\'', '\'\'')
+
+
+class NumberToken(Token):
+    """
+    Represents a number literal token.
+
+    HTSQL supports number literals in integer, float and scientific notations.
+    """
+
+    name = 'number'
+    pattern = r""" (?: \d* \.)? \d+ [eE] [+-]? \d+ | \d* \. \d+ | \d+ \.? """
+
+
+class SymbolToken(Token):
+    """
+    Represents a symbol token.
+
+    HTSQL employs the following groups of symbols:
+
+    *comparison operators*
+        ``=``, ``!=``, ``==``, ``!==``, ``~``, ``!~``,
+        ``~~``, ``!~~``, ``^~``, ``!^~``, ``^~~``, ``!^~~``,
+        ``$~``, ``!$~``, ``$~~``, ``!$~~``, ``=~``, ``!=~``,
+        ``=~~``, ``!=~~``, ``<``, ``<=``, ``>``, ``>=``.
+
+    *logical operators*
+        ``!``, ``&``, ``|``, ``->``, ``?``.
+
+    *arithmetic operators*
+        ``+``, ``-``, ``*``, ``^``.
+
+    *punctuation*
+        ``(``, ``)``, ``[``, ``]``, ``{``, ``}``,
+        ``.``, ``,``, ``/``.
+    """
+
+    name = 'symbol'
+    pattern = r"""
+        =~~ | =~ | \^~~ | \^~ | \$~~ | \$~ | ~~ | ~ |
+        !=~~ | !=~ | !\^~~ | !\^~ | !\$~~ | !\$~ | !~~ | !~ |
+        <= | < | >= | > | == | = | !== | != | ! |
+        & | \| | -> | \. | , | \? | \^ | / | \* | \+ | - |
+        \( | \) | \{ | \} | \[ | \]
+    """
+
+
+class EndToken(Token):
+    """
+    Represents the end token.
+
+    The end token is emitted when the scanner reaches the end of the input
+    string; it forces the scanner to stop.
+    """
+
+    name = 'end'
+    pattern = r""" $ """
+    is_end = True
+
+

File src/htsql/util.py

     # 'engine://username:password@host:port/database?options'.
     key_chars = r'''[%0-9a-zA-Z_.-]+'''
     value_chars = r'''[%0-9a-zA-Z`~!#$^*()_+\\|\[\]{};'",.<>/-]+'''
-    pattern = re.compile(r'''(?x)
+    pattern = r'''(?x)
         ^
         (?P<engine> %(key_chars)s )
         ://
                 %(key_chars)s = (?: %(value_chars)s )?
                 (?: & %(key_chars)s = (?: %(value_chars)s )? )* )? )?
         $
-    ''' % vars())
+    ''' % vars()
+    regexp = re.compile(pattern)
 
     def __init__(self, engine, username, password, host, port, database,
                  options=None):
 
         # If a string is given, assume it is a connection URI and parse it.
         if isinstance(value, str):
-            match = cls.pattern.search(value)
+            match = cls.regexp.search(value)
             if match is None:
                 raise ValueError("expected a connection URI of the form"
                                  " 'engine://username:password@host:port"
 
     def __str__(self):
         """Generate a connection URI corresponding to the instance."""
+        # The generated URI should only contain ASCII characters because
+        # we want it to translate to Unicode without decoding errors.
         chunks = []
         chunks.append(self.engine)
         chunks.append('://')
         return ''.join(chunks)
 
     def __repr__(self):
-        return "<%s.%s %s>" % (self.__class__.__module__,
-                               self.__class__.__name__, self)
+        return "<%s %s>" % (self.__class__.__name__, self)
 
 
 #

File src/htsql/validator.py

     `item_validator` (:class:`Val`)
         Validator for the sequence elements.
 
+    `length` (an integer or ``None``)
+        If set, check that the length of the sequence is equal to `length`.
+
     `is_nullable` (Boolean)
         If set, ``None`` values are permitted.
     """
     """
     regexp = re.compile(pattern, re.X)
 
-    def __init__(self, item_validator, is_nullable=False):
+    def __init__(self, item_validator, length=None, is_nullable=False):
         # Sanity check on the arguments.
         assert isinstance(item_validator, Val)
+        assert isinstance(length, maybe(int))
         assert isinstance(is_nullable, bool)
 
         self.item_validator = item_validator
+        self.length = length
         self.is_nullable = is_nullable
 
     def get_hint(self):
 
         # By this step, `value` must be converted to a proper list.
         if not isinstance(value, list):
-            raise ValueError("a list is required; got %r" % value)
+            raise ValueError("a list is expected; got %r" % value)
+
+        # If the length is specified, check it.
+        if self.length is not None:
+            if len(value) != self.length:
+                raise ValueError("a sequence of length %s is expected;"
+                                 " got %r" % (self.length, value))
 
         # Validate and normalize the list elements.
         items = []