hgbook / web / hgbook / converter.py

from lxml import etree
from lxml import html
from lxml.cssselect import CSSSelector
import md5
import sys

# Command-line arguments with the script name (argv[0]) stripped.
args = sys.argv[1:]

# Django bootstrap: this is a standalone script, so Django is pointed at the
# project's settings module explicitly before any model is imported.
from django.core.management import setup_environ
import settings # Assumed to be in the same directory.
setup_environ(settings)       # ugly django collateral effects :(
# NOTE(review): this import comes after setup_environ(); presumably the
# models need configured settings to import cleanly -- keep this order.
from comments.models import Element

# Document identifier; not referenced by the rest of the code visible here.
doc_id = 'MMSC'

# Elements that get a content-derived id. The commas separate independent
# selectors, so only `p` is restricted to descendants of div.chapter;
# `pre`, `h1` and `table.equation` match anywhere in the document.
sel = CSSSelector('div.chapter p, pre, h1, table.equation')
# The chapter container whose id is rewritten to a hash.
chapter_sel = CSSSelector('div.chapter')

# The first command-line argument is the path of the HTML file to convert.
# A missing argument is re-raised as an IndexError carrying a usage message.
try:
    filename = args[0]
except IndexError:
    raise IndexError("Usage: %s <path-to-html-file>" % __file__)

# Parse the input with lxml's HTML parser, discarding blank (whitespace-only)
# text nodes between tags.
html_parser = html.HTMLParser(remove_blank_text=True)
tree = etree.parse(filename, html_parser)
root = tree.getroot()

# Only the first div.chapter in the document is processed.
chapter = chapter_sel(root)[0]
original_chapter_id = chapter.get('id')
# The title is the second ':'-separated field of the id
# (presumably ids look like "<prefix>:<title>" -- confirm against the input).
chapter_title = original_chapter_id.split(':')[1]
chapter_hash = md5.new(original_chapter_id.encode('utf8')).hexdigest()

# Replace the human-readable chapter id with its MD5 hex digest.
chapter.set('id', chapter_hash)

# Give every selected element an id of the form "<chapter_hash>-<md5>" and
# record a matching commentable Element for it.
for element in sel(root):
    # Hash source, in order of preference: the element's leading text, its
    # 'alt' attribute, and finally its full serialized markup.
    hsh_source = element.text or element.get('alt') or etree.tostring(element)

    if hsh_source:
        hsh = md5.new(hsh_source.encode('utf8')).hexdigest()
        element_id = '%s-%s' % (chapter_hash, hsh)
        element.set('id', element_id)

        # create the commentable element in the DB
        e = Element()
        e.id = element_id
        e.chapter = chapter_hash
        e.title = chapter_title
        # Persist the record; without save() the populated instance is
        # simply discarded and nothing reaches the database.
        e.save()

print etree.tostring(root)      # pipe to a file if you wish
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.