Commits

Luke Plant  committed be2fc99

Beginnings of format_html function

  • Participants
  • Parent commits 59d0d85

Comments (0)

Files changed (2)

File semanticeditor/tests.py

 # -*- coding: utf-8 -*-
 
 from django.test import TestCase
-from semanticeditor.utils import extract_headings, InvalidHtml, IncorrectHeadings
+from semanticeditor.utils import extract_headings, InvalidHtml, IncorrectHeadings, format_html
 
 class TestExtract(TestCase):
     def test_extract_headings(self):
 
     def test_rejects_duplicate_headings(self):
         self.assertRaises(IncorrectHeadings, extract_headings, "<h1>Hello</h1><h2>Hello</h2>")
+
+class TestCombine(TestCase):
+    def test_no_headings(self):
+        html = "<p>Test</p>"
+        self.assertEqual(html, format_html(html, {}))
+
+    def test_no_styling(self):
+        html = "<h1>Hello</h1><p>P 1</p><h2>Heading 2</h2>"
+        outh = "<div><h1>Hello</h1><p>P 1</p><div><h2>Heading 2</h2></div></div>"
+        self.assertEqual(outh, format_html(html, {}))
+
+    def test_existing_divs(self):
+        html = "<div><foo><bar><fribble><div><div>Some text <p>para</p> some more</div><div> more <span> of </span> this stuff </div></div></fribble></bar></foo></div>"
+        outh = "<foo><bar><fribble>Some text <p>para</p> some more more <span> of </span> this stuff </fribble></bar></foo>"
+        self.assertEqual(outh, format_html(html, {}))

File semanticeditor/utils.py

 headingdef = ['h1','h2','h3','h4','h5','h6']
 
 
+def parse(content):
+    try:
+        tree = ET.fromstring("<html>" + content + "</html>")
+    except expat.ExpatError, e:
+        raise InvalidHtml("HTML content is not well formed.")
+    return tree
+
+
 def extract_headings(content):
     """
     Extracts H1, H2, etc headings, and returns a list of tuples
     containing (level, name)
     """
     # Parse
-    try:
-        tree = ET.fromstring("<html>" + content + "</html>")
-    except expat.ExpatError, e:
-        raise InvalidHtml("HTML content is not well formed.")
-
+    tree = parse(content)
     nodes = [n for n in tree.getiterator() if n.tag in headingdef]
     headings = [(int(h.tag[1]), flatten(h)) for h in nodes]
 
     else:
         tail = ''
     return node.text + ''.join(map(flatten_helper, node.getchildren())) + tail
+
+def remove_tag(tree, tag):
+    """
+    Remove all tags named tag from the tree.
+    Their contents are pulled up into the parent.
+    Returns true if the tree was changed.
+    """
+
+    cont = True
+    while cont:
+        children = list(tree.getchildren())
+        changed = False
+        for idx, node in enumerate(children):
+            if node.tag == tag:
+                tree.remove(node)
+                # Insert its contents into parent.
+
+                # 'text' is appended to older sibling's 'tail'
+                #  or into 'text' of tree
+                ntail = node.tail or ''
+                ntext = node.text or ''
+
+                if idx == 0:
+                    ttext = tree.text or ''
+                    tree.text = ttext + ntext
+                else:
+                    ctail = children[idx-1].tail or ''
+                    children[idx-1].tail = ctail + ntext
+
+                # Nodes are inserted
+                for cidx, cnode in enumerate(node.getchildren()):
+                    tree.insert(idx + cidx, cnode)
+
+                # 'tail' is prepended to younger sibling's 'text'
+                # or to 'tail' of tree
+                if idx == len(children) - 1:
+                    ttail = tree.tail or ''
+                    tree.tail = ntail + ttail
+                else:
+                    ctext = children[idx+1].text or ''
+                    children[idx+1].text = ntail + ctext
+
+                # Everything has changed, so we start again
+                changed = True
+                break
+
+        # if changed, we have to start over again.
+        cont = changed
+
+    # Recurse to children
+    for n in tree.getchildren():
+        remove_tag(n, tag)
+
+def format_html(html, styleinfo):
+    """
+    Formats the XHTML given using a dictionary of style information.
+    The dictionary has keys which are the names of headings,
+    and values which are lists of CSS classes or special commands.
+    Commands start with 'command:'
+    """
+    tree = parse(html)
+    # Strip existing divs
+    remove_tag(tree, 'div')
+
+    return ET.tostring(tree).replace('<html>','').replace('</html>','')
+