n0v4pl4n3t / html5lib / treewalkers /

"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer

To create a tree walker for a new type of tree, you need to do
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.

treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
                "dom" - The xml.dom.minidom DOM implementation
                "pulldom" - The xml.dom.pulldom event stream
                "etree" - A generic walker for tree implementations exposing an
                          elementtree-like interface (known to work with
                          ElementTree, cElementTree and lxml.etree).
                "lxml" - Optimized walker for lxml.etree
                "beautifulsoup" - Beautiful soup (if installed)
                "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                      implementing the tree type e.g. xml.etree.ElementTree or

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "lxml":
            import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)