# ferrybox /
#
# Full commit
# $Id$
# element loader based on BeautifulSoup

import BeautifulSoup as BS

# BeautifulSoup node classes that are left out of the element tree
# (kept as a tuple so it can be handed straight to isinstance)
ignorable_soup = (
    BS.Comment,
    BS.Declaration,
    BS.ProcessingInstruction,
)

# slightly silly: bind ET to the first ElementTree implementation that
# can be imported.  The C variants are tried first, then the standalone
# "elementtree" package, and finally the pure-Python module from the
# standard library, so the import fails only if none of them exist.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    try:
        import cElementTree as ET
    except ImportError:
        try:
            import elementtree.ElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET

import htmlentitydefs, re

# matches a named entity reference such as "&amp;"; group 1 is the bare
# entity name ("amp")
pattern = re.compile(r"&(\w+);")

# table mapping entity names to integer code points
try:
    name2codepoint = htmlentitydefs.name2codepoint
except AttributeError:
    # Emulate name2codepoint for Python 2.2 and earlier
    name2codepoint = {}
    for name, entity in htmlentitydefs.entitydefs.items():
        if len(entity) == 1:
            # the definition is the character itself
            name2codepoint[name] = ord(entity)
        else:
            # presumably a "&#NNN;" character reference (verify against
            # the htmlentitydefs documentation): drop the two leading
            # characters and the trailing one, keep the number
            name2codepoint[name] = int(entity[2:-1])

# Replaces named entity references (such as "&amp;") that are still
# present in a string with the characters they stand for.
# @param string Text or attribute value to clean up.
# @return The string with every entity found in name2codepoint
#     replaced; references with unknown names are left untouched.

def unescape(string):
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            # group 1 is the entity name without the "&" and ";"
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0) # use as is
    return pattern.sub(unescape_entity, string)

# Loads an XHTML or HTML file into an Element structure, using Leonard
# Richardson's tolerant BeautifulSoup parser.
# @param file Source file (either a file object or a file name).
# @param builder Optional tree builder.  If omitted, defaults to the
#     "best" available <b>TreeBuilder</b> implementation.
# @param encoding Optional character encoding of the source.  If
#     omitted, "utf-8" is used when the text decodes as UTF-8 without
#     error, and "iso-8859-1" otherwise.
# @return An Element instance representing the HTML root element.

def parse(file, builder=None, encoding=None):
    bob = builder
    # NOTE(review): several lines of this function were missing from the
    # file as received (branch bodies in emit, the read call, the try
    # line, the closing parenthesis and the emit(soup) call); they were
    # restored from the tree builder protocol (start/data/end/close) --
    # confirm against the upstream module.
    def emit(soup):
        # replay one soup node, and everything below it, as builder events
        if isinstance(soup, BS.NavigableString):
            if isinstance(soup, ignorable_soup):
                return # left out of the tree
            bob.data(unescape(soup))
        else:
            attrib = dict([(k, unescape(v)) for k, v in soup.attrs])
            bob.start(soup.name, attrib)
            for s in soup:
                emit(s)
            bob.end(soup.name)
    # read the whole document; a file opened here is also closed here,
    # a file object supplied by the caller is left open
    if hasattr(file, "read"):
        text = file.read()
    else:
        fp = open(file)
        try:
            text = fp.read()
        finally:
            fp.close()
    # determine encoding (the document charset is not reliable)
    if not encoding:
        try:
            # trial decode: keep utf-8 only if the text is valid utf-8
            encoding = "utf-8"
            unicode(text, encoding)
        except UnicodeError:
            encoding = "iso-8859-1"
    soup = BS.BeautifulSoup(
        text, convertEntities="html", fromEncoding=encoding
        )
    # build the tree
    if not bob:
        bob = ET.TreeBuilder()
    emit(soup)
    root = bob.close()
    # wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root

# Command-line use: parse the file name or URL given as the first
# argument and print the resulting tree in serialized form.
if __name__ == "__main__":
    import sys
    source = sys.argv[1]
    # URLs are fetched (https as well as http, so that an https address
    # is not mistaken for a file name); anything else is handed to
    # parse() as a file name
    if source.startswith("http:") or source.startswith("https:"):
        import urllib
        source = urllib.urlopen(source)
    print(ET.tostring(parse(source)))