# sphinx-work / sphinx / pycode /
#
# Full commit
# -*- coding: utf-8 -*-
"""
    Utilities parsing and analyzing Python code.

    :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import sys
from os import path
from cStringIO import StringIO

from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
from sphinx.util import get_module_source
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc

# load the Python grammar
_grammarfile = path.join(path.dirname(__file__), 'Grammar.txt')
pygrammar = driver.load_grammar(_grammarfile)
# the driver builds the parse tree through nodes.convert
pydriver = driver.Driver(pygrammar, convert=nodes.convert)

# an object with attributes corresponding to token and symbol names
# (attribute name -> numeric type, e.g. sym.simple_stmt, sym.expr_stmt)
class sym: pass
for k, v in pygrammar.symbol2number.iteritems():
    setattr(sym, k, v)
for k, v in token.tok_name.iteritems():
    # tok_name maps number -> name, so the pair is stored reversed
    setattr(sym, v, k)

# a dict mapping terminal and nonterminal numbers to their names
number2name = pygrammar.number2symbol.copy()
# the copy above holds the grammar symbols only; the token names have to be
# merged in as well, because AttrDocVisitor.add_docstring looks up the type
# of plain NAME leaves in this dict and would otherwise raise KeyError
number2name.update(token.tok_name)

# a regex to recognize coding cookies
_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')

# an "=" leaf used to test whether a statement is a plain assignment
_eq = nodes.Leaf(token.EQUAL, '=')

class AttrDocVisitor(nodes.NodeVisitor):
    """
    Visitor that collects docstrings for attribute assignments on toplevel and
    in classes (class attributes and attributes set in __init__).

    The docstrings can either be in special '#:' comments before the assignment
    or in a docstring after it.

    After visiting a tree, the results are available as:

    * ``collected`` -- dict mapping ``(namespace, name)`` to the value returned
      by ``prepare_commentdoc`` / ``prepare_docstring`` for that attribute
    * ``tagorder`` -- dict mapping dotted names to a running number that
      reflects the order in which the names were encountered
    """
    def init(self, scope, encoding):
        # NOTE(review): this is deliberately ``init`` and not ``__init__`` --
        # the visitor is constructed as AttrDocVisitor(number2name, scope,
        # encoding) and reads self.number2name, so the base class constructor
        # presumably stores number2name and forwards the remaining arguments
        # here.  Confirm against nodes.NodeVisitor.
        self.scope = scope
        # nesting counter: > 0 while inside an __init__ method
        self.in_init = 0
        self.encoding = encoding
        # names of the classes currently being visited, outermost first
        self.namespace = []
        self.collected = {}
        self.tagnumber = 0
        self.tagorder = {}

    def add_tag(self, name):
        """Record the definition order of *name* within the current namespace."""
        name = '.'.join(self.namespace + [name])
        self.tagorder[name] = self.tagnumber
        self.tagnumber += 1

    def visit_classdef(self, node):
        """Visit a class."""
        # node[1] is the class name leaf
        self.add_tag(node[1].value)
        # descend with the class name pushed, so that assignments inside the
        # class body are collected under the right namespace
        self.namespace.append(node[1].value)
        self.generic_visit(node)
        self.namespace.pop()

    def visit_funcdef(self, node):
        """Visit a function (or method)."""
        # usually, don't descend into functions -- nothing interesting there
        self.add_tag(node[1].value)
        if node[1].value == '__init__':
            # however, collect attributes set in __init__ methods
            self.in_init += 1
            self.generic_visit(node)
            self.in_init -= 1

    def visit_expr_stmt(self, node):
        """Visit an assignment which may have a special comment before it."""
        if _eq not in node.children:
            # not an assignment (we don't care for augmented assignments)
            return
        pnode = node[0]
        prefix = pnode.get_prefix()
        # if the assignment is the first statement on a new indentation
        # level, its preceding whitespace and comments are not assigned
        # to that token, but the first INDENT or DEDENT token
        while not prefix:
            pnode = pnode.get_prev_leaf()
            if not pnode or pnode.type not in (token.INDENT, token.DEDENT):
                # ran out of INDENT/DEDENT tokens: there is no comment
                break
            prefix = pnode.get_prefix()
        prefix = prefix.decode(self.encoding)
        docstring = prepare_commentdoc(prefix)
        self.add_docstring(node, docstring)

    def visit_simple_stmt(self, node):
        """Visit a docstring statement which may have an assignment before."""
        if node[0].type != token.STRING:
            # not a docstring; but still need to visit children
            return self.generic_visit(node)
        prev = node.get_prev_sibling()
        if not prev:
            # a string with nothing before it cannot document an assignment
            return
        if prev.type == sym.simple_stmt and \
               prev[0].type == sym.expr_stmt and _eq in prev[0].children:
            # need to "eval" the string because it's returned in its
            # original form
            docstring = literals.evalString(node[0].value, self.encoding)
            docstring = prepare_docstring(docstring)
            self.add_docstring(prev[0], docstring)

    def add_docstring(self, node, docstring):
        """Store *docstring* for every target of the assignment *node*.

        Only plain names and, inside ``__init__``, ``self.<name>`` targets
        are considered; all other targets are skipped.  An empty docstring
        still registers the target in ``tagorder``.
        """
        # add an item for each assignment target
        # (children alternate between targets and "=" leaves; the last child
        # is the assigned value, hence ``len(node) - 1``)
        for i in range(0, len(node) - 1, 2):
            target = node[i]
            if self.in_init and self.number2name[target.type] == 'power':
                # maybe an attribute assignment -- check necessary conditions
                if (# node must have two children
                    len(target) != 2 or
                    # first child must be "self"
                    target[0].type != token.NAME or target[0].value != 'self' or
                    # second child must be a "trailer" with two children
                    self.number2name[target[1].type] != 'trailer' or
                    len(target[1]) != 2 or
                    # first child must be a dot, second child a name
                    target[1][0].type != token.DOT or
                    target[1][1].type != token.NAME):
                    continue
                name = target[1][1].value
            elif target.type != token.NAME:
                # don't care about other complex targets
                continue
            else:
                name = target.value
            self.add_tag(name)
            if docstring:
                namespace = '.'.join(self.namespace)
                # only keep attributes that live in the requested scope
                if namespace.startswith(self.scope):
                    self.collected[namespace, name] = docstring

class ModuleAnalyzer(object):
    # cache for analyzer objects -- caches both by module and file name
    cache = {}

    def for_string(cls, string, modname, srcname='<string>'):
        return cls(StringIO(string), modname, srcname)

    def for_file(cls, filename, modname):
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
            fileobj = open(filename, 'r')
        except Exception, err:
            raise PycodeError('error opening %r' % filename, err)
        obj = cls(fileobj, modname, filename)
        cls.cache['file', filename] = obj
        return obj

    def for_module(cls, modname):
        if ('module', modname) in cls.cache:
            entry = cls.cache['module', modname]
            if isinstance(entry, PycodeError):
                raise entry
            return entry

            type, source = get_module_source(modname)
            if type == 'string':
                obj = cls.for_string(source, modname)
                obj = cls.for_file(source, modname)
        except PycodeError, err:
            cls.cache['module', modname] = err
        cls.cache['module', modname] = obj
        return obj

    def __init__(self, source, modname, srcname):
        # name of the module
        self.modname = modname
        # name of the source file
        self.srcname = srcname
        # file-like object yielding source lines
        self.source = source
        # will be changed when found by parse()
        self.encoding = sys.getdefaultencoding()

        # cache the source code as well
        pos = self.source.tell()
        self.code =

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None

    def tokenize(self):
        """Generate tokens from the source."""
        if self.tokens is not None:
        self.tokens = list(tokenize.generate_tokens(self.source.readline))

    def parse(self):
        """Parse the generated source tokens."""
        if self.parsetree is not None:
            self.parsetree = pydriver.parse_tokens(self.tokens)
        except parse.ParseError, err:
            raise PycodeError('parsing failed', err)
        # find the source code encoding, if present
        comments = self.parsetree.get_prefix()
        for line in comments.splitlines()[:2]:
            match =
            if match is not None:
                self.encoding =

    def find_attr_docs(self, scope=''):
        """Find class and module-level attributes and their documentation."""
        if self.attr_docs is not None:
            return self.attr_docs
        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
        self.attr_docs = attr_visitor.collected
        self.tagorder = attr_visitor.tagorder
        # now that we found everything we could in the tree, throw it away
        # (it takes quite a bit of memory for large modules)
        self.parsetree = None
        return attr_visitor.collected

    def find_tags(self):
        """Find class, function and method definitions and their location."""
        if self.tags is not None:
            return self.tags
        result = {}
        namespace = []
        stack = []
        indent = 0
        defline = False
        expect_indent = False
        def tokeniter(ignore = (token.COMMENT, token.NL)):
            for tokentup in self.tokens:
                if tokentup[0] not in ignore:
                    yield tokentup
        tokeniter = tokeniter()
        for type, tok, spos, epos, line in tokeniter:
            if expect_indent:
                if type != token.INDENT:
                    # no suite -- one-line definition
                    assert stack
                    dtype, fullname, startline, _ = stack.pop()
                    endline = epos[0]
                    result[fullname] = (dtype, startline, endline)
                expect_indent = False
            if tok in ('def', 'class'):
                name =[1]
                fullname = '.'.join(namespace)
                stack.append((tok, fullname, spos[0], indent))
                defline = True
            elif type == token.INDENT:
                expect_indent = False
                indent += 1
            elif type == token.DEDENT:
                indent -= 1
                # if the stacklevel is the same as it was before the last
                # def/class block, this dedent closes that block
                if stack and indent == stack[-1][3]:
                    dtype, fullname, startline, _ = stack.pop()
                    endline = spos[0]
                    result[fullname] = (dtype, startline, endline)
            elif type == token.NEWLINE:
                # if this line contained a definition, expect an INDENT
                # to start the suite; if there is no such INDENT
                # it's a one-line definition
                if defline:
                    defline = False
                    expect_indent = True
        self.tags = result
        return result

if __name__ == '__main__':
    import time, pprint
    x0 = time.time()
    #ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), '')
    ma = ModuleAnalyzer.for_file('sphinx/',
    x1 = time.time()
    x2 = time.time()
    #for (ns, name), doc in ma.find_attr_docs().iteritems():
    #    print '>>', ns, name
    #    print '\n'.join(doc)
    x3 = time.time()
    #print nodes.nice_repr(ma.parsetree, number2name)
    print "tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2)