Commits

Luke Plant committed 79458b1

Implemented column formatting

Comments (0)

Files changed (2)

semanticeditor/tests.py

 # -*- coding: utf-8 -*-
 
 from django.test import TestCase
-from semanticeditor.utils import extract_headings, InvalidHtml, IncorrectHeadings, format_html, parse, get_parent, get_index
+from semanticeditor.utils import extract_headings, InvalidHtml, IncorrectHeadings, format_html, parse, get_parent, get_index, BadStructure, NEWROW, NEWCOL
 
 class TestExtract(TestCase):
     def test_extract_headings(self):
         self.assertEqual(outh, format_html(html, {'Hello you':['class:myclass'],
                                                   'Hi':['class:c1', 'class:c2']}))
 
+    def test_sanity_check_sections(self):
+        # A heading nested inside another element (here a blockquote) is
+        # not at the top level of the document, so format_html is
+        # expected to raise BadStructure.
+        html = "<h1>Hello</h1><blockquote><h2>Hi</h2></blockquote>"
+        self.assertRaises(BadStructure, format_html, html, {})
+
+    def test_columns_1(self):
+        # 'new row' on section 1 and 'new column' on section 2: the
+        # expected output is a single two-column row, with section 1 in
+        # the first column and sections 2 and 3 in the second.
+        html = "<h1>1</h1><p>para 1</p><h1>2</h1><h1>3</h1>"
+        outh = "<div class=\"row2col\"><div class=\"col\"><div><h1>1</h1><p>para 1</p></div></div><div class=\"col\"><div><h1>2</h1></div><div><h1>3</h1></div></div></div>"
+        self.assertEqual(outh, format_html(html, {'1':[NEWROW],
+                                                  '2':[NEWCOL]}))
+
+    def test_columns_missing_newrow(self):
+        # A 'new column' command with no 'new row' command before it is
+        # expected to be rejected with BadStructure.
+        html = "<h1>1</h1><p>para 1</p><h1>2</h1><h1>3</h1>"
+        self.assertRaises(BadStructure, format_html, html, {'2':[NEWCOL]})
+
+
 class TestElementTreeUtils(TestCase):
     def test_get_parent(self):
         """

semanticeditor/utils.py

 class IncorrectHeadings(ValueError):
     pass
 
+class BadStructure(ValueError):
+    """
+    Raised when the document cannot be formatted as requested, e.g. a
+    section heading is not at the top level of the document, or a 'new
+    column' command is used without a preceding 'new row' command.
+    """
+    pass
 
 headingdef = ['h1','h2','h3','h4','h5','h6']
+NEWROW = 'command:newrow'
+NEWCOL = 'command:newcolumn'
 
+def _is_command(x):
+    return x.startswith('command:')
+
+def _get_class(x):
+    return x[6:]
+
+def _is_class(x):
+    return x.startswith('class:')
 
 def parse(content):
     try:
     # First, all h1, h2 etc tags will be children of the root.
     # remove_tag should have ensured that, otherwise we will be unable
     # to cut the HTML into sections.
-    for level, h in headers:
+    for level, name, h in headers:
         parent = get_parent(root, h)
-        # TODO: nicer assert
-        assert parent is root
+        if parent is not root:
+            raise BadStructure("Section heading \"%(name)s\" is not at the top level of "
+                               "the document. This interferes with the ability to "
+                               "format the sections and apply columns. "
+                               "Please move the heading out of the '%(element)s'"
+                               " element that contains it." % dict(name=name, element=parent.tag))
+
+def _invert_dict(d):
+    return dict((v,k) for (k,v) in d.items())
+
+def _apply_commands(root, section_nodes, styleinfo, headers):
+    # Rules:
+    #  - No nesting of columns within columns
+    #  - Within a given row, newcolumn must be applied to
+    #    divs that are at the same level.
+    #  - No columns allowed if newrow has not been started.
+
+    # Headers has the sections in document order
+    sections = [(level, name, section_nodes[name])
+                for level, name, n in headers]
+
+    # Inverted dict
+    known_nodes = _invert_dict(section_nodes)
+
+    # Preprocess:
+    #  - insert 'newcolumn' on everything that has 'newrow'
+    for level, name, hn in headers:
+        if NEWROW in styleinfo[name]:
+            styleinfo[name].add(NEWCOL)
+
+    _add_rows_and_columns(root, known_nodes, styleinfo)
+    # TODO: due to HTML/CSS quirks, we may need to add an empty <div
+    # class="rowclear"> after every <div class="row">
+
+def _add_rows_and_columns(topnode, known_nodes, styleinfo):
+    cur_row_start = None
+    cur_col = None
+    children = topnode.getchildren()
+    for idx, node in enumerate(children):
+        name = known_nodes.get(node)
+        if name is None:
+            # If not a section node, it cannot contain sections.
+            # or have commands
+            continue
+        commands = styleinfo[name]
+
+        if NEWROW in commands:
+            if cur_row_start is not None:
+                # The previous row is finished
+                _apply_row_col_divs(topnode, cur_row_start_idx, idx, columns)
+
+            # start a new row
+            cur_row_start = node
+            cur_row_start_idx = idx
+            columns = []
+
+        if NEWCOL in commands:
+            if cur_row_start is None:
+                raise BadStructure("'New column' command was found on section "
+                                   "'%(name)s' without an appropriate 'new row' "
+                                   "command before it. " % dict(name=name))
+            else:
+                columns.append((idx, node))
+
+        if not cur_row_start:
+            # Rows/columns can only be added within the same level of nesting
+            # of the HTML document.  This means we do not need to recurse if
+            # we have started adding rows/columns.
+            _add_rows_and_columns(node, known_nodes, styleinfo)
+            # However, it would be good to recurse and check that no
+            # NEWROW/COL commands were found, and warn the user if
+            # they are.
+
+        # If we are at last node, and are still in a row, there won't
+        # be a NEWROW command, so we have to close implicitly,
+        # including the current node in the row (hence idx + 1).
+        if idx == len(children) - 1 and cur_row_start is not None \
+                and len(columns) > 0:
+                _apply_row_col_divs(topnode, cur_row_start_idx, idx + 1, columns)
+
+
+def _apply_row_col_divs(parent, start_idx, stop_idx, columns):
+    # Add the row
+    newrow = wrap_elements_in_tag(parent, start_idx, stop_idx, 'div')
+    newrow.set('class', 'row%dcol' % len(columns))
+
+    # Add the columns
+
+    # The idx in 'columns' are all out now, due to having pulled the
+    # nodes out. Fix them up, and add a dummy entry to provide the
+    # 'stop_idx' for the last column.
+    columns = [(idx - start_idx, node) for (idx, node) in columns]
+    columns.append((stop_idx - start_idx, None))
+
+    # Go in reverse order, so that indices are not invalidated
+    columns.reverse()
+    for i, (idx, node) in enumerate(columns):
+        if node is None:
+            # last dummy entry
+            continue
+        newcol = wrap_elements_in_tag(newrow, idx, columns[i - 1][0], 'div')
+        newcol.set('class', 'col')
+
+
+def _sanitise_styleinfo(styleinfo, headingnames):
+    # Replace lists with sets
+    out = {}
+    for k, v in styleinfo.items():
+        out[k] = set(v)
+
+    # Ensure that all sections have an entry in styleinfo
+    for level, name in headingnames:
+        if not name in out:
+            out[name] = set()
+
+    return out
+
+# == Formatting HTML ==
+#
+# The user is allowed to assign presentation to different sections.
+# The sections are identified by headings, so that formatting will be
+# consistent with the logical structure of the document.
+#
+# This imposes a certain div structure on the HTML.  Consider the following
+# document:
+#
+# - H1 - Section 1
+#   - H2 - Section 1.1
+#   - P
+#   - H2 - Section 1.2
+# - H1 - Section 2
+#   etc
+#
+# If the user wants 'Section 1' in a blue, bordered box, the only
+# (practical) way to do it in CSS is to create a div around *all* of
+# section 1 (including Section 1.1 and Section 1.2) and apply a CSS
+# class to it. The div structures must therefore nest according to the
+# logical structure of the document.
+#
+# If the user decided that column 1 should contain Section 1 up to
+# Section 1.1, and that column 2 should contain Section 1.2 up to
+# Section 2, this would require a div structure incompatible with the
+# above. Thus the column layout is limited by the logical structure of
+# the document.
+
+def wrap_elements_in_tag(parent, start_idx, stop_idx, tag):
+    """
+    Wrap elements in parent at indices [start_idx:stop_idx] with
+    a new element
+    """
+    newelem = ET.Element(tag)
+    group = parent[start_idx:stop_idx]
+    newelem[:] = group
+    parent[start_idx:stop_idx] = [newelem]
+    return newelem
 
 def format_html(html, styleinfo):
     """
     # Ensure that the headings are well formed and the HTML is valid
     headingnames = extract_headings(html)
 
+    styleinfo = _sanitise_styleinfo(styleinfo, headingnames)
+
     root = parse(html)
 
-    # Strip existing divs
+    # Strip existing divs, otherwise we cannot format properly.  If
+    # there are other block level elements that mess things up, we
+    # raise BadStructure later, but divs have no semantics so can just
+    # be removed.
     cleanup(root, lambda t: t.tag != 'div')
 
     # Get the heading nodes, decorated with the level of the heading
-    headers = [(int(n.tag[1]), n) for n in root.getiterator() if n.tag in headingdef]
+    headers = [(int(n.tag[1]), flatten(n), n) for n in root.getiterator() if n.tag in headingdef]
 
     _assert_sane_sections(root, headers)
 
+    section_nodes = {}
     # Cut the HTML up into sections
-    for idx, (level, h) in enumerate(headers):
-        name = flatten(h)
+    for idx, (level, name, h) in enumerate(headers):
         # We can no longer assume that parent = root, because the divs
         # we insert will change that.  However, the divs we insert
         # will keep sub-section headings on the same level.
 
         # 'scope' of each section is from heading node to before the next
         # heading with a level the same or higher
-        nextnodes = [(l,n) for (l,n) in headers[idx+1:] if l <= level]
+        nextnodes = [(l,n) for (l,nname,n) in headers[idx+1:] if l <= level]
         # Bug in elementtree - throws AssertionError if we try
         # to set a slice with [something:None]. So we use len()
         # instead of None
                 # div already), just go to end
                 last_elem = len(parent)
 
-        group = parent[first_elem:last_elem]
-
-        # Create a new div for them
-        newdiv = ET.Element("div")
-        newdiv[:] = group
-
-        # Replace original element
-        parent[first_elem:last_elem] = [newdiv]
+        newdiv = wrap_elements_in_tag(parent, first_elem, last_elem, "div")
 
         # Apply css styles
-        classes = [s[6:] for s in styleinfo.get(name, []) if s.startswith("class:")]
+        classes = [_get_class(s) for s in styleinfo[name] if _is_class(s)]
+        classes.sort()
         if classes:
             newdiv.set("class", " ".join(classes))
 
+        section_nodes[name] = newdiv
 
-        # TODO - store div for later processing
-
-    # TODO - apply commands to divs
+    _apply_commands(root, section_nodes, styleinfo, headers)
 
     return ET.tostring(root).replace('<html>','').replace('</html>','')