# jslex / jslex.py

"""JsLex: a lexer for Javascript"""

import re

class Tok(object):
    """Describe one kind of token that a lexer can recognize.

    Attributes:
        id: serial number taken from the class-wide counter `num` at
            construction time; `Lexer` uses it to build a regex group name.
        name: label reported for every piece of text this spec matches.
        regex: pattern text for the token.
        next: name of the lexer state to switch to after this token, or
            None to stay in the current state.

    """

    # Count of Tok instances created so far, i.e. the id of the next one.
    num = 0

    def __init__(self, name, regex, next=None):
        self.name, self.regex, self.next = name, regex, next
        # Claim the current counter value, then advance the shared counter.
        self.id = Tok.num
        Tok.num = self.id + 1

def literals(choices, prefix="", suffix=""):
    """Build an alternation regex from whitespace-separated literal words.

    Each word in `choices` is regex-escaped so it matches itself exactly,
    wrapped in `prefix` and `suffix` (both empty by default), and the
    results are joined with `|` in the order the words were given.

    """
    alternatives = [
        prefix + escaped + suffix
        for escaped in map(re.escape, choices.split())
    ]
    return "|".join(alternatives)

class Lexer(object):
    """A generic multi-state regex-based lexer.

    Each state owns one compiled regex that is the alternation of all the
    token rules valid in that state.  Rules are tried in list order, so an
    earlier rule wins over a later one that could match at the same position.
    Patterns are compiled with re.MULTILINE and re.VERBOSE.

    """

    def __init__(self, states, first):
        """Compile the rules of every state.

        `states` maps a state name to a list of token specs; each spec must
        provide `id`, `name`, `regex` and `next` attributes.  `first` is the
        name of the state the lexer starts in.

        """
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                # The group name is derived from the spec's id, so a spec
                # shared by several states maps to the same entry each time.
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).

        The current state is kept on the instance, so it carries over from
        one call to the next.  Characters that no rule of the current state
        matches are skipped; if nothing in the remaining text matches, lexing
        stops.  Rules are expected to match at least one character.

        """
        pos = 0
        end = len(text)
        while pos < end:
            # Scan from `pos` inside the original string instead of slicing
            # the remainder off, which would copy the text at every state
            # change.
            for match in self.regexes[self.state].finditer(text, pos):
                # Every alternative is wrapped in exactly one named group, and
                # that group closes last, so lastgroup identifies the rule.
                tok = self.toks[match.lastgroup]
                pos = match.end()
                yield (tok.name, match.group())
                if tok.next:
                    # The remaining text must be scanned with the new
                    # state's regex, so restart the scan from `pos`.
                    self.state = tok.next
                    break
            else:
                # The scan ran to the end of the text without a state change:
                # whatever is left cannot be matched, so stop rather than spin.
                break


class JsLexer(Lexer):
    """A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    The lexer has two states, which differ only in how a slash is read:
    in 'div' it is a division operator, in 'reg' it opens a regex literal.
    Each token rule names the state that applies after it; rules without a
    `next` (comments, whitespace, "other") leave the state unchanged.

    This doesn't properly handle non-Ascii characters in the Javascript source.

    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly lex
    # correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    # Rules shared by both states, tried before the slash rules.
    both_before = [
        Tok("comment",      r"/\*(.|\n)*?\*/"),
        Tok("linecomment",  r"//.*?$"),
        Tok("ws",           r"\s+"),
        Tok("keyword",      literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof new
                                return super switch throw try typeof var
                                void while with
                                """, suffix=r"\b"), next='reg'),
        # `this` is a complete expression, so a slash after it is division.
        Tok("keyword",      literals("this", suffix=r"\b"), next='div'),
        Tok("reserved",     literals("null true false", suffix=r"\b"), next='div'),
        Tok("id",           r"([a-zA-Z_$]|\\u[0-9a-fA-F]{4})([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*", next='div'),
        Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
        # Like every other numeric literal, an octal one is followed by division.
        Tok("onum",         r"0[0-7]+", next='div'),
        Tok("dnum",         r"""
                            (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                                \.                      # dot
                                [0-9]*                  # DecimalDigits-opt
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            |
                                \.                      # dot
                                [0-9]+                  # DecimalDigits
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            |
                                (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                                ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                            )
                            """, next='div'),
        # Punctuation is split by the state that follows it: operators that
        # expect an operand lead to 'reg', tokens that close an expression
        # lead to 'div'.
        Tok("punct",        literals("""
                                >>>= === !== >>> <<= >>= <= >= == != << >> &&
                                || += -= *= %= &= |= ^=
                                """), next="reg"),
        Tok("punct",        literals("++ -- ) ]"), next='div'),
        Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
        ]

    # Catch-all tried last in both states: any single non-newline character.
    both_after = [
        Tok("other",        r"."),
        ]

    states = {
        'div': # slash will mean division
            both_before + [
            Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
            Tok("regex",
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/\[]           # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/\[]            # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
            ] + both_after,
        }

    def __init__(self):
        """Create a lexer over `states` that starts in the 'reg' state."""
        super(JsLexer, self).__init__(self.states, 'reg')


def js_to_c_for_gettext(js):
    """Rewrite the Javascript source `js` as C-like text for xgettext.

    The source is lexed with a fresh JsLexer and reassembled token by token.
    Every regex literal is replaced with the string literal "REGEX"; all
    other tokens are passed through unchanged.

    """
    # C doesn't grok regexes, and they aren't needed for gettext, so a
    # plain string stands in for each one.
    return ''.join(
        '"REGEX"' if kind == 'regex' else text
        for kind, text in JsLexer().lex(js)
    )
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.