"""JsLex: a lexer for Javascript"""
# From https://bitbucket.org/ned/jslex

import re

class Tok(object):
    """A specification for a token class."""

    num = 0

    def __init__(self, name, regex, next=None):
        self.id = Tok.num
        Tok.num += 1
        self.name = name
        self.regex = regex
        self.next = next

def literals(choices, prefix="", suffix=""):
    """Create a regex from a space-separated list of literal `choices`.
    
    If provided, `prefix` and `suffix` will be attached to each choice
    individually.

    """
    return "|".join(prefix+re.escape(c)+suffix for c in choices.split())

class Lexer(object):
    """A generic multi-state regex-based lexer."""

    def __init__(self, states, first):
        self.regexes = {}
        self.toks = {}

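        # Build one combined regex per state: each Tok becomes a named group
        # ("t<id>"), so match.lastgroup later tells us which token matched.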
        for state, rules in states.items():
            parts = []
            for tok in rules:
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE|re.VERBOSE)

        self.state = first

    def lex(self, text):
        """Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).

        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0

        while start < end:
            for match in regexes[state].finditer(text, start):
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                start += len(toktext)
                yield (tok.name, toktext)

                if tok.next:
                    state = tok.next
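                    # The state changed, so a different combined regex applies;
                    # break out and restart finditer with the new state's regex.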
                    break

        self.state = state


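# Note on the two states (added commentary, not in the original file): tokens
# that can end a value (identifiers, numbers, strings, a closing ")" or "]")
# switch to 'div', where a following slash is division; keywords and most
# operators switch to 'reg', where a slash starts a regex literal.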
class JsLexer(Lexer):
    """A Javascript lexer
    
    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-ASCII characters in the Javascript source.

    """

    # Because these tokens are matched as alternatives in a regex, longer possibilities
    # must appear in the list before shorter ones, for example, '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly lex
    # correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment",      r"/\*(.|\n)*?\*/"),
        Tok("linecomment",  r"//.*?$"),
        Tok("ws",           r"\s+"),
        Tok("keyword",      literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof new
                                return super switch this throw try typeof var
                                void while with
                                """, suffix=r"\b"), next='reg'),
        Tok("reserved",     literals("null true false", suffix=r"\b"), next='div'),
        Tok("id",           r"""
                            ([a-zA-Z_$   ]|\\u[0-9a-fA-F]{4})       # first char
                            ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*      # rest chars
                            """, next='div'),
        Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum",         r"0[0-7]+"),
        Tok("dnum",         r"""
                            (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                                \.                      # dot
                                [0-9]*                  # DecimalDigits-opt
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            |   
                                \.                      # dot
                                [0-9]+                  # DecimalDigits
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            |   
                                (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            )
                            """, next='div'),
        Tok("punct",        literals("""
                                >>>= === !== >>> <<= >>= <= >= == != << >> && 
                                || += -= *= %= &= |= ^=
                                """), next="reg"),
        Tok("punct",        literals("++ -- ) ]"), next='div'),
        Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
        ]

    both_after = [
        Tok("other",        r"."),
        ]

    states = {
        'div': # slash will mean division
            both_before + [
            Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
            Tok("regex",        
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
            ] + both_after,
        }

    def __init__(self):
        super(JsLexer, self).__init__(self.states, 'reg')


def js_to_c_for_gettext(js):
    """Convert the Javascript source `js` into something resembling C for xgettext.
    
    What actually happens is that all the regex literals are replaced with
    "REGEX".
    
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        s = m.group(0)
        if s == '"':
            return r'\"'
        else:
            return s
                    
    lexer = JsLexer()
    c = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"';
        elif name == 'string':
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            if tok.startswith("'"):
                guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
                tok = '"' + guts + '"'
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers.  We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U");
        c.append(tok)
    return ''.join(c)
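

# Minimal usage sketch (added as illustration, not part of the original
# module); the sample Javascript below is made up.
if __name__ == "__main__":
    sample = "x = /ab+c/.test(s) ? 'yes' : \"no\";"
    for name, tok in JsLexer().lex(sample):
        print("%-8s %r" % (name, tok))
    print(js_to_c_for_gettext(sample))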