headingdef = ['h1','h2','h3','h4','h5','h6']
+NEWROW = 'command:newrow'
+NEWCOL = 'command:newcolumn'
+ return x.startswith('command:')
+ return x.startswith('class:')
# First, all h1, h2 etc tags will be children of the root.
# remove_tag should have ensured that, otherwise we will be unable
# to cut the HTML into sections.
- for level, h in headers:
+ for level, h in headers:
parent = get_parent(root, h)
+ raise BadStructure("Section heading \"%(name)s\" is not at the top level of "
+ "the document. This interferes with the ability to "
+ "format the sections and apply columns. "
+ "Please move the heading out of the '%(element)s'"
+ " element that contains it." % dict(name=name, element=parent.tag))
+ return dict((v,k) for (k,v) in d.items())
+def _apply_commands(root, section_nodes, styleinfo, headers):
+ # - No nesting of columns within columns
+ # - Within a given row, newcolumn must be applied to
+ # divs that are at the same level.
+ # - No columns allowed if newrow has not been started.
+ # Headers has the sections in document order
+ sections = [(level, name, section_nodes[name])
+ for level, name, n in headers]
+ known_nodes = _invert_dict(section_nodes)
+ # - insert 'newcolumn' on everything that has 'newrow'
+ for level, name, hn in headers:
+ if NEWROW in styleinfo[name]:
+ _add_rows_and_columns(root, known_nodes, styleinfo)
+ # TODO: due to HTML/CSS quirks, we may need to add an empty <div
+ # class="rowclear"> after every <div class="row">
+def _add_rows_and_columns(topnode, known_nodes, styleinfo):
+ children = topnode.getchildren()
+ for idx, node in enumerate(children):
+ name = known_nodes.get(node)
+ # If not a section node, it cannot contain sections.
+ commands = styleinfo[name]
+ if cur_row_start is not None:
+ # The previous row is finished
+ _apply_row_col_divs(topnode, cur_row_start_idx, idx, columns)
+ cur_row_start_idx = idx
+ if cur_row_start is None:
+ raise BadStructure("'New column' command was found on section "
+ "'%(name)s' without an appropriate 'new row' "
+ "command before it. " % dict(name=name))
+ columns.append((idx, node))
+ # Rows/columns can only be added within the same level of nesting
+ # of the HTML document. This means we do not need to recurse if
+ # we have started adding rows/columns.
+ _add_rows_and_columns(node, known_nodes, styleinfo)
+ # However, it would be good to recurse and check that no
+ # NEWROW/COL commands were found, and warn the user if
+ # If we are at the last node and are still in a row, there won't
+ # be another NEWROW command to end it, so we have to close the row
+ # implicitly, including the current node in it (hence idx + 1).
+ if idx == len(children) - 1 and cur_row_start is not None \
+ _apply_row_col_divs(topnode, cur_row_start_idx, idx + 1, columns)
+def _apply_row_col_divs(parent, start_idx, stop_idx, columns):
+ newrow = wrap_elements_in_tag(parent, start_idx, stop_idx, 'div')
+ newrow.set('class', 'row%dcol' % len(columns))
+ # The indices in 'columns' are now wrong, because the nodes have
+ # been pulled out of the parent. Make them relative to start_idx, and
+ # add a dummy entry to provide the 'stop_idx' for the last column.
+ columns = [(idx - start_idx, node) for (idx, node) in columns]
+ columns.append((stop_idx - start_idx, None))
+ # Go in reverse order, so that indices are not invalidated
+ for i, (idx, node) in enumerate(columns):
+ newcol = wrap_elements_in_tag(newrow, idx, columns[i - 1], 'div')
+ newcol.set('class', 'col')
+def _sanitise_styleinfo(styleinfo, headingnames):
+ # Replace lists with sets
+ for k, v in styleinfo.items():
+ # Ensure that all sections have an entry in styleinfo
+ for level, name in headingnames:
+# The user is allowed to assign presentation to different sections.
+# The sections are identified by headings, so that formatting will be
+# consistent with the logical structure of the document.
+# This imposes a certain div structure on the HTML. Consider the following
+# If the user wants 'Section 1' in a blue, bordered box, the only
+# (practical) way to do it in CSS is to create a div around *all* of
+# section 1 (including Section 1.1 and Section 1.2) and apply a CSS
+# class to it. The div structures must therefore nest according to the
+# logical structure of the document.
+# If the user decided that column 1 should contain Section 1 up to
+# Section 1.1, and that column 2 should contain Section 1.2 up to
+# Section 2, this would require a div structure incompatible with the
+# above. Thus the column layout is limited by the logical structure of
+def wrap_elements_in_tag(parent, start_idx, stop_idx, tag):
+ Wrap elements in parent at indices [start_idx:stop_idx] with
+ newelem = ET.Element(tag)
+ group = parent[start_idx:stop_idx]
+ parent[start_idx:stop_idx] = [newelem]
def format_html(html, styleinfo):
# Ensure that the headings are well formed and the HTML is valid
headingnames = extract_headings(html)
+ styleinfo = _sanitise_styleinfo(styleinfo, headingnames)
+ # Strip existing div, otherwise we cannot format properly. If
+ # there are other block level elements that mess things up, we
+ # raise BadStructure later, but divs have no semantics so can just
cleanup(root, lambda t: t.tag != 'div')
# Get the heading nodes, decorated with the level of the heading
- headers = [(int(n.tag), n) for n in root.getiterator() if n.tag in headingdef]
+ headers = [(int(n.tag), n) for n in root.getiterator() if n.tag in headingdef]
# Cut the HTML up into sections
- for idx, (level, h) in enumerate(headers):
+ for idx, (level, name, h) in enumerate(headers):
# We can no longer assume that parent = root, because the divs
# we insert will change that. However, the divs we insert
# will keep sub-section headings on the same level.
# The 'scope' of each section runs from its heading node up to (but not
# including) the next heading at the same or a higher level (i.e. an
# equal or smaller heading number).
- nextnodes = [(l,n) for (l,n) in headers[idx+1:] if l <= level]
+ nextnodes = [(l,n) for (l,n) in headers[idx+1:] if l <= level]
# Bug in elementtree - throws AssertionError if we try
# to set a slice with [something:None]. So we use len()
# div already), just go to end
- group = parent[first_elem:last_elem]
- # Create a new div for them
- newdiv = ET.Element("div")
- # Replace original element
- parent[first_elem:last_elem] = [newdiv]
+ newdiv = wrap_elements_in_tag(parent, first_elem, last_elem, "div")
- classes = [s[6:] for s in styleinfo.get(name, ) if s.startswith("class:")]
+ classes = [_get_class(s) for s in styleinfo[name] if _is_class(s)]
newdiv.set("class", " ".join(classes))
+ section_nodes[name] = newdiv
- # TODO - store div for later processing
- # TODO - apply commands to divs
+ _apply_commands(root, section_nodes, styleinfo, headers)