Commits

Ned Batchelder committed 719817f

A good weekend's work.

  • Participants

Comments (0)

Files changed (2)

+import re
+
+class Tok(object):
+    """A token specification: a name, a regex, and an optional next state.
+
+    `name` is the token name, `regex` is the pattern text for the token, and
+    `next`, if given, names the lexer state to switch to after this token.
+    Every Tok also gets an integer `id`, taken from a class-wide counter in
+    creation order.
+
+    """
+    # Class-wide counter that supplies the `id` of the next Tok created.
+    num = 0
+
+    def __init__(self, name, regex, next=None):
+        self.id = Tok.num
+        Tok.num += 1
+        self.name = name
+        self.regex = regex
+        self.next = next
+
+def literals(choices):
+    """Create a regex from a whitespace-separated list of literal `choices`.
+
+    Each choice is escaped with `re.escape` and the results are joined with
+    "|" in the order given, so an earlier choice is tried before a later one.
+
+    """
+    return "|".join([re.escape(c) for c in choices.split()])
+
+class Lexer(object):
+    """A generic multi-state regex-based lexer."""
+
+    def __init__(self, states, first):
+        self.regexes = {}
+        self.toks = {}
+
+        for state, rules in states.items():
+            parts = []
+            for tok in rules:
+                groupid = "t%d" % tok.id
+                self.toks[groupid] = tok
+                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
+            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE+re.VERBOSE)
+
+        self.state = first
+
+    def lex(self, text):
+        """Lexically analyze `text`.
+
+        Yields pairs (`name`, `token`).
+
+        """
+        while text:
+            eaten = 0
+            for match in self.regexes[self.state].finditer(text):
+                for name, toktext in match.groupdict().iteritems():
+                    if toktext is not None:
+                        tok = self.toks[name]
+                        new_state = tok.next
+                        eaten += len(toktext)
+                        yield (tok.name, toktext)
+                if new_state:
+                    self.state = new_state
+                    break
+            text = text[eaten:]
+
+
+class JsLexer(Lexer):
+    """A Javascript lexer
+    
+    >>> lexer = JsLexer()
+    >>> list(lexer.lex("a = 1"))
+    [("id", "a"), ("ws", " "), ("punct", "="), ("ws", " "), ("dnum 1")]
+
+    This doesn't properly handle non-Ascii characters in the Javascript source.
+
+    """
+
+    # Because these tokens are matched as alternatives in a regex, longer possibilities
+    # must appear in the list before shorter ones, for example, '>>' before '>'.
+    #
+    # Note that we don't have to detect malformed Javascript, only properly lex
+    # correct Javascript, so much of this is simplified.
+
+    # Details of Javascript lexical structure are taken from
+    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
+
+    both_before = [
+        Tok("comment",      r"/\*(.|\n)*?\*/"),
+        Tok("linecomment",  r"//.*?$"),
+        Tok("ws",           r"\s+"),
+        Tok("keyword",      literals("""
+                                break case catch class const continue debugger
+                                default delete do else enum export extends
+                                finally for function if import in instanceof new
+                                return super switch this throw try typeof var
+                                void while with
+                                """), next='reg'),
+        Tok("reserved",     literals("null true false"), next='div'),
+        Tok("id",           r"([a-zA-Z_$]|\\u[0-9a-fA-Z]{4})([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*", next='div'),
+        Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
+        Tok("dnum",         r"""
+                            (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
+                                \.                      # dot
+                                [0-9]*                  # DecimalDigits-opt
+                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
+                            |   
+                                \.                      # dot
+                                [0-9]+                  # DecimalDigits
+                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
+                            |   
+                                (0|[1-9][0-9]*)         # DecimalIntegerLiteral
+                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
+                            )
+                            """, next='div'),
+        Tok("punct",        literals("""
+                                >>>= === !== >>> <<= >>= <= >= == != << >> && 
+                                || += -= *= %= &= |= ^=
+                                """), next="reg"),
+        Tok("punct",        literals("++ -- ) ]"), next='div'),
+        Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
+        Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
+        Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
+        ]
+
+    both_after = [
+        Tok("other",        r"."),
+        ]
+
+    states = {
+        'div': # slash will mean division
+            both_before + [
+            Tok("punct", literals("/= /"), next='reg'),
+            ] + both_after,
+
+        'reg':  # slash will mean regex
+            both_before + [
+            Tok("regex",        
+                r"""
+                    /                       # opening slash
+                    # First character is..
+                    (   [^*\\/[]            # anything but * \ / or [
+                    |   \\.                 # or an escape sequence
+                    |   \[                  # or a class, which has
+                            (   [^\]\\]     #   anything but ] or \
+                            |   \\.         #   or an escape sequence
+                            )*              #   many times
+                        \]
+                    )
+                    # Following characters are same, except for excluding a star
+                    (   [^\\/[]             # anything but \ / or [
+                    |   \\.                 # or an escape sequence
+                    |   \[                  # or a class, which has
+                            (   [^\]\\]     #   anything but ] or \
+                            |   \\.         #   or an escape sequence
+                            )*              #   many times
+                        \]
+                    )*                      # many times
+                    /                       # closing slash
+                    [a-zA-Z0-9]*            # trailing flags
+                """, next='div'),
+            ] + both_after,
+        }
+
+    def __init__(self):
+        super(JsLexer, self).__init__(self.states, 'reg')
+
+
+def js_to_c_for_gettext(js):
+    """Convert the Javascript source `js` into something resembling C for xgettext.
+    
+    What actually happens is that all the regex literals are replaced with
+    "REGEX".
+    
+    """
+    lexer = JsLexer()
+    c = []
+    for name, tok in lexer.lex(js):
+        if name == 'regex':
+            # C doesn't grok regexes, and they aren't needed for gettext,
+            # so just output a string instead.
+            tok = '"REGEX"';
+        c.append(tok)
+    return ''.join(c)
+# Tests for jslex
+
+import difflib
+from unittest import TestCase
+from jslex import JsLexer, js_to_c_for_gettext
+
+class JsLexTestCase(TestCase):
+    def assertMultiLineEqual(self, first, second):
+        """Assert that two multi-line strings are equal.
+
+        If they aren't, show a nice diff.
+
+        """
+        if first != second:
+            message = ''.join(difflib.ndiff(first.splitlines(True), second.splitlines(True)))
+            self.fail("Multi-line strings are unequal:\n" + message)
+
+    def assertListsEqual(self, first, second):
+        if first != second:
+            lines1 = [repr(e) for e in first]
+            lines2 = [repr(e) for e in second]
+            message = '\n'.join(difflib.ndiff(lines1, lines2))
+            self.fail("Lists are unequal:\n" + message)
+
+
+
+class JsTokensTest(JsLexTestCase):
+    """Token-level tests for JsLexer.
+
+    Each entry of LEX_CASES pairs a Javascript source string with the tokens
+    expected from it, written as "name text" strings with whitespace tokens
+    left out.  The test methods themselves are attached to this class by the
+    module-level loop over LEX_CASES.
+
+    """
+    LEX_CASES = [
+        ("a ABC $ _ a123", ["id a", "id ABC", "id $", "id _", "id a123"]),    
+        (r"\u1234 abc\u0020 \u0065_\u0067", [r"id \u1234", r"id abc\u0020", r"id \u0065_\u0067"]),
+        ("123 1.234 0.123e-3 0 1E+40 1e1 .123", ["dnum 123", "dnum 1.234", "dnum 0.123e-3", "dnum 0", "dnum 1E+40", "dnum 1e1", "dnum .123"]),
+        ("0x1 0xabCD 0XABcd", ["hnum 0x1", "hnum 0xabCD", "hnum 0XABcd"]),
+        ("0xa123ghi", ["hnum 0xa123", "id ghi"]),
+        ("function Function FUNCTION", ["keyword function", "id Function", "id FUNCTION"]),
+        ("a//b", ["id a", "linecomment //b"]),
+        ("/****/a/=2//hello", ["comment /****/", "id a", "punct /=", "dnum 2", "linecomment //hello"]),
+        ("/*\n * Header\n */\na=1;", ["comment /*\n * Header\n */", "id a", "punct =", "dnum 1", "punct ;"]),
+        ("a+++b", ["id a", "punct ++", "punct +", "id b"]),
+        (r"a=/a*/,1", ["id a", "punct =", "regex /a*/", "punct ,", "dnum 1"]),
+        (r"a=/a*[^/]+/,1", ["id a", "punct =", "regex /a*[^/]+/", "punct ,", "dnum 1"]),
+        (r"a=/a*\[^/,1", ["id a", "punct =", r"regex /a*\[^/", "punct ,", "dnum 1"]),
+        (r"a=/\//,1", ["id a", "punct =", r"regex /\//", "punct ,", "dnum 1"]),
+
+        # next two are from http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
+        ("""for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}""",
+            ["keyword for", "punct (", "keyword var", "id x", "punct =", "id a", "keyword in",
+            "id foo", "punct &&", 'string "</x>"', "punct ||", "id mot", "punct ?", "id z",
+            "punct :", "regex /x:3;x<5;y</g", "punct /", "id i", "punct )", "punct {",
+            "id xyz", "punct (", "id x", "punct ++", "punct )", "punct ;", "punct }"]),
+        ("""for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}""",
+            ["keyword for", "punct (", "keyword var", "id x", "punct =", "id a", "keyword in",
+            "id foo", "punct &&", 'string "</x>"', "punct ||", "id mot", "punct ?", "id z",
+            "punct /", "id x", "punct :", "dnum 3", "punct ;", "id x", "punct <", "dnum 5",
+            "punct ;", "id y", "punct <", "regex /g/i", "punct )", "punct {",
+            "id xyz", "punct (", "id x", "punct ++", "punct )", "punct ;", "punct }"]),
+        ]
+
+def make_function(input, toks):
+    def test_func(self):
+        lexer = JsLexer()
+        result = ["%s %s" % (name, tok) for name, tok in lexer.lex(input) if name != 'ws']
+        self.assertListsEqual(result, toks)
+    return test_func
+
+# Attach one test method per LEX_CASES entry to JsTokensTest, named
+# test_case_<index>, so that each case is run as a test of its own.
+for i, (input, toks) in enumerate(JsTokensTest.LEX_CASES):
+    setattr(JsTokensTest, "test_case_%d" % i, make_function(input, toks))
+
+
+# Cases for js_to_c_for_gettext.  Cases are separated by lines of 40 "=".
+# Within a case, a line of 20 "-" separates the Javascript input (above it)
+# from the expected output (below it).
+GETTEXT_CASES = r"""
+========================================
+a = 1; /* /[0-9]+/ */
+b = 0x2a0b / 1; // /[0-9]+/
+c = 3;
+--------------------
+a = 1; /* /[0-9]+/ */
+b = 0x2a0b / 1; // /[0-9]+/
+c = 3;
+========================================
+a = 1.234e-5;
+/*
+ * /[0-9+/
+ */
+b = .0123;
+--------------------
+a = 1.234e-5;
+/*
+ * /[0-9+/
+ */
+b = .0123;
+========================================
+x = y / z;
+alert(gettext("hello"));
+x /= 3;
+--------------------
+x = y / z;
+alert(gettext("hello"));
+x /= 3;
+========================================
+s = "Hello \"th/foo/ere\"";
+s = 'He\x23llo \'th/foo/ere\'';
+--------------------
+s = "Hello \"th/foo/ere\"";
+s = 'He\x23llo \'th/foo/ere\'';
+========================================
+s = "Line continuation\
+continued /hello/ still the string";/hello/;
+--------------------
+s = "Line continuation\
+continued /hello/ still the string";"REGEX";
+========================================
+var regex = /pattern/;
+var regex2 = /matter/gm;
+var regex3 = /[*/]+/gm.foo("hey");
+--------------------
+var regex = "REGEX";
+var regex2 = "REGEX";
+var regex3 = "REGEX".foo("hey");
+========================================
+for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}
+for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}
+--------------------
+for (var x = a in foo && "</x>" || mot ? z:"REGEX"/i) {xyz(x++);}
+for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y<"REGEX") {xyz(x++);}
+========================================
+"""
+
+
+class JsToCForGettextTest(JsLexTestCase):
+    """Tests for js_to_c_for_gettext.
+
+    The class body is empty: the test methods are attached to it by the
+    module-level loop over GETTEXT_CASES.
+
+    """
+    pass
+
+def make_function(js, c):
+    def test_func(self):
+        self.assertMultiLineEqual(js_to_c_for_gettext(js), c)
+    return test_func
+
+# Attach one test method per GETTEXT_CASES case to JsToCForGettextTest.
+# Splitting on the "=" lines also produces blank chunks (before the first
+# separator and after the last); these are skipped, but they still use up an
+# index, so the test names are not numbered from zero.
+for i, pair in enumerate(GETTEXT_CASES.split('='*40+'\n')):
+    if not pair.strip():
+        continue
+    # The "-" line divides a case into the Javascript input and the expected output.
+    js, c = pair.split('-'*20+'\n')
+    setattr(JsToCForGettextTest, "test_case_%d" % i, make_function(js, c))
+