Commits

Luke Plant committed e92f5d0

Lots more implementation of HTML cleaning

Comments (0)

Files changed (3)

semanticeditor/tests.py

 # -*- coding: utf-8 -*-
 
 from django.test import TestCase
-from semanticeditor.utils import extract_structure, InvalidHtml, IncorrectHeadings, format_html, parse, get_parent, get_index, BadStructure, TooManyColumns, NEWROW, NEWCOL, extract_presentation, get_structure, clean_html
-from semanticeditor.utils.presentation import PresentationInfo, PresentationClass, StructureItem, LayoutDetails
+from semanticeditor.utils import *
 
 PC = PresentationClass
 
 
     def test_existing_divs(self):
         html = "<div><foo><bar><fribble><div><div>Some text <p>para</p> some more</div><div> more <span> of </span> this stuff </div></div></fribble></bar></foo></div>"
-        outh = '<div class="row"><div><div><foo><bar><fribble>Some text <p>para</p> some more more <span> of </span> this stuff </fribble></bar></foo></div></div></div>'
+        outh = '<div class="row"><div><div><p><foo><bar><fribble><p>Some text para some more more <span> of </span> this stuff </p></fribble></bar></foo></p></div></div></div>'
         self.assertEqual(outh, format_html(html, {}))
 
     def test_add_css_classes(self):
         p = get_parent(t, n)
         self.assertEqual(1, get_index(p,n))
 
+    def test_eliminate_tag_1(self):
+        # Single child at index 0: its text is spliced between the parent's
+        # leading text and the child's former tail.
+        tree = ET.fromstring("<a>Hello<b>Goodbye</b>End</a>")
+        eliminate_tag(tree, 0)
+        expected = "<a>HelloGoodbyeEnd</a>"
+        self.assertEqual(expected, ET.tostring(tree))
+
+    def test_eliminate_tag_2(self):
+        # First of two children is eliminated; the second <b> must be untouched.
+        tree = ET.fromstring("<a>Hello<b>Goodbye</b>Some<b>More</b>End</a>")
+        eliminate_tag(tree, 0)
+        expected = "<a>HelloGoodbyeSome<b>More</b>End</a>"
+        self.assertEqual(expected, ET.tostring(tree))
+
+    def test_eliminate_tag_3(self):
+        # Last of two children is eliminated; the first <b> must be untouched.
+        tree = ET.fromstring("<a>Hello<b>Goodbye</b>Some<b>More</b>End</a>")
+        eliminate_tag(tree, 1)
+        expected = "<a>Hello<b>Goodbye</b>SomeMoreEnd</a>"
+        self.assertEqual(expected, ET.tostring(tree))
+
+    def test_eliminate_tag_4(self):
+        # The eliminated element has children of its own (<x>, <y>); they are
+        # expected to be pulled up into <a> in their original order.
+        tree = ET.fromstring("<a>Hello<b>Good<x>b</x><y>y</y>e</b>End</a>")
+        eliminate_tag(tree, 0)
+        expected = "<a>HelloGood<x>b</x><y>y</y>eEnd</a>"
+        self.assertEqual(expected, ET.tostring(tree))
+
+    def test_eliminate_tag_5(self):
+        # Middle element (with children) is eliminated while it has siblings on
+        # both sides.
+        source = "<a>Hello<b>First <c>node</c></b>tail<b>Good<x>b</x><y>y</y>e</b>And<b>Stuff</b></a>"
+        expected = "<a>Hello<b>First <c>node</c></b>tailGood<x>b</x><y>y</y>eAnd<b>Stuff</b></a>"
+        tree = ET.fromstring(source)
+        eliminate_tag(tree, 1)
+        self.assertEqual(expected, ET.tostring(tree))
+
+
 
 class TestExtractPresentation(TestCase):
     def test_extract_presentation(self):
 		H2.cjk { font-family: "DejaVu Sans"; font-size: 14pt; font-style: italic }
 		H2.ctl { font-family: "DejaVu Sans"; font-size: 14pt; font-style: italic }
 	--&gt;
-	</style><p class="western">Global Café Bible
-study: <strong>Luke 6:46-49</strong></p><h2 class="western">Words and phrases</h2><table width="459" cellpadding="4"><col width="110"><col width="334"><tbody><tr><td><p class="western">torrent</p></td><td><p class="western">a violently fast stream of water</p></td></tr></tbody><p class="western"></p><h2 class="western">Questions</h2><p class="western"></p><p class="western">What does it mean for
-people to call Jesus “Lord, Lord”?</p></col>
+	</style><p class="western"><strong>My Café</strong></p><h2 class="western">Heading</h2><table width="459" cellpadding="4"><col width="110"><col width="334"><tbody><tr><td><p class="western">cell1</p></td><td><p class="western">cell2</p></td></tr></tbody><p class="western"></p><h2 class="western">Heading 2</h2><p class="western"></p><p class="western">Some “text”</p></col>
 """
     firefox_oowriter_output_1 = u"""
-<p>Global Caf&#233; Bible
-study: <strong>Luke 6:46-49</strong></p><h2>Words and phrases</h2><p>torrent</p><p>a violently fast stream of water</p><p/><h2>Questions</h2><p/><p>What does it mean for
-people to call Jesus &#8220;Lord, Lord&#8221;?</p>
+<p><strong>My Caf&#233;</strong></p><h2>Heading</h2><p>cell1</p><p>cell2</p><h2>Heading 2</h2><p>Some “text”</p>
 """
 
+    def assertEqualClean(self, input, output):
+        """
+        Assert that expected output is the same as the input cleaned.
+
+        'input' is passed through clean_html; both that result and the
+        expected 'output' are then run through pretty_print and stripped
+        before being compared, so that a failure shows two comparable,
+        indented documents.
+        """
+        actual_output = clean_html(input)
+        # Do a pretty_print to make error messages nicer
+        s1 = pretty_print(output).strip()
+        s2 = pretty_print(actual_output).strip()
+        try:
+            self.assertEqual(s1, s2)
+        except self.failureException:
+            # Only a failed assertion should trigger the dump; a bare
+            # 'except:' would also intercept KeyboardInterrupt/SystemExit.
+            # Print expected then actual, and let the failure propagate.
+            print
+            print s1
+            print
+            print s2
+            raise
+
     def test_cleanup_safari_1(self):
-        self.assertEqual(self.safari_output_1, clean_html(self.safari_example_1))
+        self.assertEqualClean(self.safari_example_1,
+                              self.safari_output_1)
 
     def test_cleanup_firefox_oowriter_1(self):
-        output = clean_html(self.firefox_oowriter_example_1)
-        # Check that output is well formed.
-        parse(output, clean=False)
-        self.assertEqual(self.firefox_oowriter_output_1, output)
+        self.assertEqualClean(self.firefox_oowriter_example_1,
+                              self.firefox_oowriter_output_1)
 
     def test_cleanup_tables(self):
-        self.assertEqual("<p>Hello</p><p>P2</p>", clean_html("<table><tbody><tr><td><p>Hello</p></td></tr></tbody><p>P2</p></table>"));
+        self.assertEqualClean("<table><tbody><tr><td><p>Hello</p></td></tr></tbody><p>P2</p>text</table>",
+                              "<p>Hello</p><p>P2</p><p>text</p>");
 
+    def test_toplevel_text(self):
+        # Bare text at the top level must end up wrapped in some tag.
+        dirty = "test"
+        expected = "<p>test</p>"
+        self.assertEqualClean(dirty, expected)
+
+    def test_div_to_p(self):
+        # A <div> is expected to come out as a <p>.
+        dirty = "<div>Foo</div>"
+        expected = "<p>Foo</p>"
+        self.assertEqualClean(dirty, expected)
+
+    def test_nested_p(self):
+        # Nested paragraphs are expected to be flattened into siblings.
+        dirty = "<p>Hello <p>How are <p>you</p></p> today</p>"
+        expected = "<p>Hello </p><p>How are </p><p>you</p><p> today</p>"
+        self.assertEqualClean(dirty, expected)
+
+    def test_br_to_p(self):
+        # A double <br /> between runs of text is expected to become a
+        # paragraph break.
+        dirty = "This is<br /><br />a test"
+        expected = "<p>This is</p><p>a test</p>"
+        self.assertEqualClean(dirty, expected)
+
+    def test_p_in_li(self):
+        # A lone <p> inside an <li> is expected to be unwrapped.
+        dirty = "<ul><li><p>An item</p></li></ul>"
+        expected = "<ul><li>An item</li></ul>"
+        self.assertEqualClean(dirty, expected)

semanticeditor/utils/etree.py

     else:
         if level and (not elem.tail or not elem.tail.strip()):
             elem.tail = i
+
+def eliminate_tag(parent, index):
+    """
+    Eliminates the tag from node at index 'index' from the parent.  The contents
+    (text, children and tail) are pulled up into parent, keeping document order.
+
+    'parent' is modified in place; nothing is returned.
+    """
+    elem = parent[index]
+    first = index == 0
+
+    # 'text'
+    # Whatever precedes elem's own text is held either by parent.text (elem is
+    # the first child) or by the tail of the previous sibling, so elem.text is
+    # appended there.
+    if first:
+        parent.text = textjoin(parent.text, elem.text)
+    else:
+        prev = parent[index-1]
+        prev.tail = textjoin(prev.tail, elem.text)
+
+    # 'tail'
+    if len(elem) > 0:
+        # The children are about to be moved up, so the last child's tail is
+        # the text immediately before elem.tail.
+        elem[-1].tail = textjoin(elem[-1].tail, elem.tail)
+    elif first:
+        # No children: tail follows elem.text, which now sits in parent.text
+        parent.text = textjoin(parent.text, elem.tail)
+    else:
+        # No children: tail follows elem.text, which now sits in the previous
+        # sibling's tail.  This holds whether or not elem is the last child;
+        # putting the tail on the next sibling's text would move it *inside*
+        # that sibling's tag.
+        prev = parent[index-1]
+        prev.tail = textjoin(prev.tail, elem.tail)
+
+    # Replace element with its children.  The children are snapshotted first
+    # because inserting them into parent detaches them from elem; inserting in
+    # reverse at the same index preserves their order.
+    children = list(elem)
+    parent.remove(elem)
+    for c in reversed(children):
+        parent.insert(index, c)
+

semanticeditor/utils/presentation.py

 from lxml import etree as ET
 from lxml.html import HTMLParser
 from pyquery import PyQuery as pq
-from semanticeditor.utils.etree import cleanup, flatten, get_parent, get_depth, get_index, indent
+from semanticeditor.utils.etree import cleanup, flatten, get_parent, get_depth, get_index, indent, eliminate_tag
 from semanticeditor.utils.datastructures import struct
 import re
 
     of dirty user provided HTML
     """
     if clean:
-        tree = ET.fromstring('<html><body>' + fixentities(content) + '</body></html>', parser=HTMLParser())
+        tree = ET.fromstring(u'<html><body>' + fixentities(content) + u'</body></html>', parser=HTMLParser())
         clean_tree(tree)
     else:
         try:
-            tree = ET.fromstring("<html><body>" + fixentities(content) + "</body></html>")
+            tree = ET.fromstring(u"<html><body>" + fixentities(content) + u"</body></html>")
         except ET.XMLSyntaxError, e:
             raise InvalidHtml("HTML content is not well formed.")
     return tree
 
-# NB: ElementTree is bizarre - after parsing some UTF-8 bytestrings,
-# it will then return nodes that are 'str's if the text is all ASCII,
-# otherwise 'unicode's (having correctly interpreted the UTF-8).  When
-# serialising to JSON, this works out OK actually, so we leave it as
-# is for the moment.
+# NB: ElementTree is bizarre - after parsing some UTF-8 bytestrings, it will
+# then return nodes that are 'str's if the text is all ASCII, otherwise
+# 'unicode's (having correctly interpreted the UTF-8).  When serialising to
+# JSON, this works out OK actually, so we leave it as is for the moment.
+
+def pretty_print(content):
+    """
+    Parses 'content', runs indent() over the resulting tree and returns the
+    tree serialised by _html_extract.
+    """
+    tree = parse(content)
+    indent(tree)
+    serialised = _html_extract(tree)
+    return serialised
 
 ### Semantic editor functionality ###
 
     # affect tests.
     layout_strategy = get_layout_details_strategy()
     html = layout_strategy.extract_pre_parse_hacks(html)
-    root = parse(html)
+    root = parse(html, clean=False) # it's important we don't clean.
     root = layout_strategy.extract_post_parse_hacks(root)
     structure = get_structure(root)
     structure = layout_strategy.extract_structure_hacks(structure)
 def _replace_with_children(e):  # callback handed to doc(x).each(...) for table-related tags in clean_tree
     e.replaceWith(e.find('*'))  # NOTE(review): presumably swaps e for the elements matched by '*' beneath it - confirm against pyquery
 
+def _empty_text(x):
+    """
+    Returns True if x is None or consists only of whitespace.
+    """
+    if x is None:
+        return True
+    stripped = x.strip()
+    return stripped == ""
+
+def _promote_child_text(elem, tag):
+    """
+    Wraps loose text at either end of elem in a new 'tag' element: elem's own
+    leading text, and the tail of elem's last child.  Whitespace-only text is
+    left where it is.
+    """
+    leading = elem.text
+    if not _empty_text(leading):
+        wrapper = ET.Element(tag)
+        wrapper.text = leading
+        elem.text = None
+        elem.insert(0, wrapper)
+
+    # Nothing can trail if there are no children at all.
+    if len(elem) == 0:
+        return
+
+    last_child = elem[-1]
+    trailing = last_child.tail
+    if _empty_text(trailing):
+        return
+
+    wrapper = ET.Element(tag)
+    wrapper.text = trailing
+    last_child.tail = None
+    elem.append(wrapper)
+
+def _clean_nested(elem):
+    """
+    Recursively eliminates any 'p' that is a direct child of another 'p',
+    pulling its contents up into the outer one.
+    """
+    children = elem.getchildren()
+    # Walk the snapshot from the last child to the first, so that eliminating
+    # a child never shifts the index of one still to be visited.
+    for idx in range(len(children) - 1, -1, -1):
+        child = children[idx]
+        _clean_nested(child)
+        if elem.tag == 'p' and child.tag == 'p':
+            eliminate_tag(elem, idx)
+
+def _replace_block_elements(elem):
+    """
+    Renames every 'div' below elem (at any depth) to 'p'.  elem itself is not
+    renamed.
+    """
+    for node in elem.getchildren():
+        # Descend first; renaming node afterwards does not affect its subtree.
+        _replace_block_elements(node)
+        if node.tag == 'div':
+            node.tag = 'p'
+
 def clean_tree(root):  # works on root in place; calls itself again until the serialised HTML stops changing
     """
     Cleans dirty HTML from an ElementTree
     """
+    initial_html = _html_extract(root)  # snapshot, compared with the result at the end
+    body = root[0] # <html><body>
+    # If there is text directly in body, it needs wrapping in a block element.
+    _promote_child_text(body, 'p')
+
+    # First replace divs
+    _replace_block_elements(body)
+
+    # Deal with nested 'p's and other elements.
+    _clean_nested(body)
+
     doc = pq(root)
     doc('*').each(_clean_elem)  # _clean_elem is defined outside this hunk
     doc('style').remove()  # drop <style> elements
     doc('col').remove()  # drop <col> elements
+
     for x in ['table', 'tbody', 'thead', 'tr', 'td']:  # table markup is handed to _replace_with_children
         doc(x).each(_replace_with_children)
 
+    def pull_up(n):  # eliminate n's tag, located via its parent and index beneath body
+        p = get_parent(body, n)
+        i = get_index(p, n)
+        eliminate_tag(p, i)
+
+    for n in doc('li p:only-child'):  # <p> under an <li> that is the only child of its parent
+        pull_up(n)
+
+    doc('br + br').remove()  # a <br> immediately following another <br>
+    doc('p + br').remove()  # a <br> immediately following a <p>
+    doc('p:empty').remove()  # <p> elements with no content
+
+    # Removed elements can give problems which need to be fixed again.  We keep
+    # iterating through this until we get the same answer!
+    output_html = _html_extract(root)
+    if initial_html == output_html:  # this pass changed nothing: done
+        return
+    else:
+        clean_tree(root)  # something changed: run another full pass
+
 def clean_html(html):  # returns the cleaned markup as serialised by _html_extract
     tree = parse(html, clean=True)  # with clean=True, parse() runs clean_tree on the parsed tree
     return _html_extract(tree)