Daniele Varrazzo committed 86aa214

Added PostgreSQL specific SQL and interactive session lexers

  • Parent commits 8ad6d35

Files changed (3)

File pygments/lexers/_postgres_builtins.py

+"""
+    pygments.lexers._postgres_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Self-updating data files for PostgreSQL lexer.
+
+    :copyright: Copyright 2011 by Daniele Varrazzo.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+import urllib2
+
+# One man's constant is another man's variable.
+SOURCE_URL = 'https://github.com/postgres/postgres/raw/REL9_0_STABLE'
+KEYWORDS_URL = SOURCE_URL + '/doc/src/sgml/keywords.sgml'
+DATATYPES_URL = SOURCE_URL + '/doc/src/sgml/datatype.sgml'
+
+def update_myself():
+    datatypes = parse_datatypes(fetch(DATATYPES_URL))
+    keywords = parse_keywords(fetch(KEYWORDS_URL))
+    update_consts(__file__, 'DATATYPES', datatypes)
+    update_consts(__file__, 'KEYWORDS', keywords)
+
+def parse_keywords(f):
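+    """Extract keyword names from the <entry><token> rows of
+    keywords.sgml and return them sorted."""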
+    kw = []
+    re_entry = re.compile(r'\s*<entry><token>([^<]+)</token></entry>')
+    for line in f:
+        m = re_entry.match(line)
+        if m is None:
+            continue
+
+        kw.append(m.group(1))
+
+    kw.sort()
+    return kw
+
+def parse_datatypes(f):
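+    """Extract type names from the <entry><type> rows of datatype.sgml."""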
+    dt = set()
+    re_entry = re.compile(r'\s*<entry><type>([^<]+)</type></entry>')
+    for line in f:
+        if '<sect1' in line:
+            break
+        if '<entry><type>' not in line:
+            continue
+
+        # Parse a string such as
+        # time [ (<replaceable>p</replaceable>) ] [ without time zone ]
+        # into types "time" and "without time zone"
+
+        # remove all the tags
+        line = re.sub("<replaceable>[^<]+</replaceable>", "", line)
+        line = re.sub("<[^>]+>", "", line)
+
+        # Split out the square-bracketed parts and drop anything
+        # containing parentheses (the leftover precision specs)
+        parts = [t for chunk in line.split('[')
+                 for t in chunk.split(']') if "(" not in t]
+        for part in parts:
+            for t in part.split(','):
+                t = t.strip()
+                if not t:
+                    continue
+                dt.add(" ".join(t.split()))
+
+    dt = list(dt)
+    dt.sort()
+    return dt
+
+def fetch(url):
+    return urllib2.urlopen(url)
+
+def update_consts(filename, constname, content):
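+    """Rewrite the list constant `constname` in `filename` in place,
+    packing the items of `content` into lines of at most 75 characters."""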
+    f = open(filename)
+    lines = f.readlines()
+    f.close()
+
+    # Line to start/end inserting
+    re_start = re.compile(r'^%s\s*=\s*\[\s*$' % constname)
+    re_end = re.compile(r'^\s*\]\s*$')
+    start = [ n for n, l in enumerate(lines) if re_start.match(l) ]
+    if not start:
+        raise ValueError("couldn't find line containing '%s = ['" % constname)
+    if len(start) > 1:
+        raise ValueError("too many lines containing '%s = ['" % constname)
+    start = start[0] + 1
+
+    end = [ n for n, l in enumerate(lines) if n >= start and re_end.match(l) ]
+    if not end:
+        raise ValueError("couldn't find line containing ']' after %s " % constname)
+    end = end[0]
+
+    # Pack the new content in lines not too long
+    content = [repr(item) for item in content]
+    new_lines = [[]]
+    for item in content:
+        if sum(map(len, new_lines[-1])) + 2 * len(new_lines[-1]) + len(item) + 4 > 75:
+            new_lines.append([])
+        new_lines[-1].append(item)
+
+    lines[start:end] = [ "    %s,\n" % ", ".join(items) for items in new_lines ]
+
+    f = open(filename, 'w')
+    f.write(''.join(lines))
+    f.close()
+
+
+# Autogenerated: please edit them if you like wasting your time.
+
+KEYWORDS = [
+    ]
+
+DATATYPES = [
+    ]
+
+
+if __name__ == '__main__':
+    update_myself()
+
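
A note for context, not part of the commit: the module is self-updating. Running it as a script fetches keywords.sgml and datatype.sgml from the PostgreSQL 9.0 branch and rewrites the KEYWORDS and DATATYPES lists in place through update_consts:

    $ python pygments/lexers/_postgres_builtins.py

After a run, each empty list above is packed into comma-separated lines of at most 75 characters, roughly like this (keyword names illustrative):

    KEYWORDS = [
        'ABORT', 'ABSOLUTE', 'ACCESS', 'ACTION', 'ADD', 'ADMIN', 'AFTER',
        ]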

File pygments/lexers/postgres.py

+"""
+    pygments.lexers.postgres
+    ~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Lexers for PostgreSQL-specific SQL and psql interactive session.
+
+    :copyright: Copyright 2011 by Daniele Varrazzo.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+
+from pygments.lexer import Lexer, RegexLexer, include, bygroups, using, \
+     this, do_insertions
+from pygments.token import Error, Punctuation, Literal, Token, \
+     Text, Comment, Operator, Keyword, Name, String, Number, Generic
+
+from pygments.lexers._postgres_builtins import KEYWORDS, DATATYPES
+
+
+__all__ = [ 'PostgresLexer', 'PostgresConsoleLexer' ]
+
+line_re = re.compile('.*?\n')
+
+
+class PostgresLexer(RegexLexer):
+    """
+    Lexer for the PostgreSQL dialect of SQL.
+    """
+
+    name = 'PostgreSQL SQL dialect'
+    aliases = ['postgresql', 'postgres']
+    mimetypes = ['text/x-postgresql']
+
+    flags = re.IGNORECASE
+    tokens = {
+        'root': [
+            (r'\s+', Text),
+            (r'--.*?\n', Comment.Single),
+            (r'/\*', Comment.Multiline, 'multiline-comments'),
+            (r'(' + '|'.join(KEYWORDS) + r')\b', Keyword),
+            (r'(' + '|'.join([s.replace(" ", r"\s+") for s in DATATYPES])
+                  + r')\b', Name.Builtin),
+            (r'[+*/<>=~!@#%^&|`?-]', Operator),
+            (r'::', Operator),  # cast
+            (r'([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
+            (r'[0-9]+', Number.Integer),
+            # TODO: Backslash escapes?
+            (r"'(''|[^'])*'", String.Single),
+            (r'"(""|[^"])*"', String.Name), # quoted identifier
+            (r'[a-zA-Z_][a-zA-Z0-9_]*', Name),
+            (r'[;:()\[\],\.]', Punctuation),
+            # psql backslash command.
+            # This actually belongs to the console lexer,
+            # but putting it here makes things easier.
+            (r'\\.*?\n', Name),             # TODO: what is a good token?
+        ],
+        'multiline-comments': [
+            (r'/\*', Comment.Multiline, 'multiline-comments'),
+            (r'\*/', Comment.Multiline, '#pop'),
+            (r'[^/\*]+', Comment.Multiline),
+            (r'[/*]', Comment.Multiline)
+        ]
+    }
+
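+# Helper regexes for the console lexer below: psql prompts such as
+# "test=#" (with the "-#" and "(#" variants of continuation lines), psql
+# backslash commands, server messages (DEBUG, INFO, WARNING, ERROR, HINT,
+# LINE n:) and the "^" marker psql prints under the offending character.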
+re_prompt = re.compile(r'^([a-zA-Z_][a-zA-Z0-9_]*)?[=\-\(]#')
+re_psql_command = re.compile(r'(\s*)(\\.+?)(\s+)$')
+re_error = re.compile(r'ERROR:')
+re_message = re.compile(r'(DEBUG|INFO|WARNING|ERROR|HINT|LINE [0-9]+:?)(.*?\n)')
+re_charhint = re.compile(r'\s*\^\s*\n')
+
+class PostgresConsoleLexer(Lexer):
+    """
+    Lexer for psql sessions.
+
+    TODO: multiline comments are broken.
+    """
+
+    name = 'PostgreSQL console (psql)'
+    aliases = ['psql', 'postgresql-console', 'postgres-console']
+    mimetypes = ['text/x-postgresql-psql']
+
+    def get_tokens_unprocessed(self, data):
+        sql = PostgresLexer(**self.options)
+
+        curcode = ''
+        insertions = []
+        out_token = Generic.Output
+        for match in line_re.finditer(data):
+            line = match.group()
+            mprompt = re_prompt.match(line)
+            if mprompt is not None:
+                out_token = Generic.Output
+                insertions.append((len(curcode),
+                                   [(0, Generic.Prompt, mprompt.group())]))
+                curcode += line[len(mprompt.group()):]
+            else:
+                if curcode:
+                    for item in do_insertions(insertions,
+                                              sql.get_tokens_unprocessed(curcode)):
+                        yield item
+                    curcode = ''
+                    insertions = []
+                mmsg = re_message.match(line)
+                if mmsg is not None:
+                    if mmsg.group(1).startswith("ERROR"):
+                        out_token = Generic.Error
+                    yield (mmsg.start(1), Generic.Strong, mmsg.group(1))
+                    yield (mmsg.start(2), out_token, mmsg.group(2))
+                elif re_charhint.match(line):
+                    yield (match.start(), out_token, line)
+                else:
+                    yield (match.start(), Generic.Output, line)
+
+        if curcode:
+            for item in do_insertions(insertions,
+                                      sql.get_tokens_unprocessed(curcode)):
+                yield item
+
+
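
For illustration, outside the diff: both lexers plug into the standard Pygments highlight API. A minimal sketch, with the sample session text made up:

    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from pygments.lexers.postgres import PostgresLexer, PostgresConsoleLexer

    # A standalone statement, lexed as the PostgreSQL dialect of SQL.
    print(highlight('SELECT 1::integer;', PostgresLexer(), HtmlFormatter()))

    # A captured psql session: the prompt is emitted as Generic.Prompt, the
    # query is re-lexed as SQL, and the ERROR/LINE messages are marked up.
    session = ('test=# SELECT * FROM missing;\n'
               'ERROR:  relation "missing" does not exist\n'
               'LINE 1: SELECT * FROM missing;\n'
               '                      ^\n')
    print(highlight(session, PostgresConsoleLexer(), HtmlFormatter()))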

File tests/test_basic_api.py

         if cls.__name__ not in (
             'PythonConsoleLexer', 'RConsoleLexer', 'RubyConsoleLexer',
             'SqliteConsoleLexer', 'MatlabSessionLexer', 'ErlangShellLexer',
-            'BashSessionLexer', 'LiterateHaskellLexer'):
+            'BashSessionLexer', 'LiterateHaskellLexer', 'PostgresConsoleLexer'):
             inst = cls(ensurenl=False)
             ensure(inst.get_tokens('a\nb'), 'a\nb')
             inst = cls(ensurenl=False, stripall=True)