Luke Plant committed 9879dde

Switched 'format_html' to be tolerant and clean bad HTML

Comments (0)

Files changed (2)

semanticeditor/tests.py

 
 class TestHacks(TestCase):
     def test_div_format_hack(self):
-        html = '<p class="div">Test</p>'
+        html = '<p>Test</p>'
         outh = '<div class=\"row\"><div><div><div class="div">Test</div></div></div></div>'
-        self.assertEqual(outh, format_html(html, {}))
+        self.assertEqual(outh, format_html(html, {'p_1':[PC('div')]}))
 
     def test_div_extract_hack(self):
         html = '<div class="div">Test</div>'
     safari_example_1 = """
 <p style="margin-top: 0px; margin-right: 0px; margin-bottom: 0.8em; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; font-size: 0.9em; line-height: 1.4em; "><strong style="font-weight: bold; ">Formerly: Community Health Sciences Research (CHSR) IRG</strong></p><p style="margin-top: 0px; margin-right: 0px; margin-bottom: 0.8em; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; font-size: 0.9em; line-height: 1.4em; ">The Clinical Epidemiology IRG aims to undertake research that makes an important difference to patient care. Our work is divided into two broad research areas:</p><h4 style="color: rgb(153, 0, 51); margin-top: 0px; margin-right: 0px; margin-bottom: 0.25em; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; font-size: 1.1em; line-height: 1.3em; "><strong style="font-weight: bold; ">Clinical and environmental epidemiology -</strong>&#160;including</h4><ul style="margin-top: 0px; margin-right: 0px; margin-bottom: 1.5em; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; line-height: 1.4em; font-size: 0.9em; "><li style="margin-top: 0px; margin-right: 0px;margin-bottom: 0.25em; margin-left: 20px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px;padding-left: 0px; ">mental health</li><li style="margin-top: 0px; margin-right: 0px; margin-bottom: 0.25em; margin-left: 20px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; ">child protection</li><li style="margin-top: 0px; margin-right: 0px; margin-bottom: 0.25em; margin-left: 20px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px;">cancer</li><li style="margin-top: 0px; margin-right: 0px; margin-bottom: 0.25em; margin-left: 20px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; ">environmental, economic and social risk 
factors</li></ul></span>
 """
-    safari_output_1 = """<p><strong>Formerly: Community Health Sciences Research (CHSR) IRG</strong></p><p>The Clinical Epidemiology IRG aims to undertake research that makes an important difference to patient care. Our work is divided into two broad research areas:</p><h4><strong>Clinical and environmental epidemiology -</strong>&#160;including</h4><ul><li>mental health</li><li>child protection</li><li>cancer</li><li>environmental, economic and social risk factors</li></ul>"""
+    safari_output_1 = """
+<p><strong>Formerly: Community Health Sciences Research (CHSR) IRG</strong></p><p>The Clinical Epidemiology IRG aims to undertake research that makes an important difference to patient care. Our work is divided into two broad research areas:</p><h4><strong>Clinical and environmental epidemiology -</strong>&#160;including</h4><ul><li>mental health</li><li>child protection</li><li>cancer</li><li>environmental, economic and social risk factors</li></ul>"""
 
     firefox_oowriter_example_1 = u"""
 <style type="text/css">
 study: <strong>Luke 6:46-49</strong></p><h2 class="western">Words and phrases</h2><table width="459" cellpadding="4"><col width="110"><col width="334"><tbody><tr><td><p class="western">torrent</p></td><td><p class="western">a violently fast stream of water</p></td></tr></tbody><p class="western"></p><h2 class="western">Questions</h2><p class="western"></p><p class="western">What does it mean for
 people to call Jesus “Lord, Lord”?</p></col>
 """
-    firefox_oowriter_output_1 = u"""<p>Global Caf&#233; Bible
+    firefox_oowriter_output_1 = u"""
+<p>Global Caf&#233; Bible
 study: <strong>Luke 6:46-49</strong></p><h2>Words and phrases</h2><table width="459" cellpadding="4"><col width="110"/><col width="334"/><tbody><tr><td><p>torrent</p></td><td><p>a violently fast stream of water</p></td></tr></tbody><p/><h2>Questions</h2><p/><p>What does it mean for
 people to call Jesus &#8220;Lord, Lord&#8221;?</p>
 </table>"""

semanticeditor/utils/presentation.py

     of dirty user provided HTML
     """
     if clean:
-        tree = ET.fromstring('<html>' + fixentities(content) + '</html>', parser=HTMLParser())
+        tree = ET.fromstring('<html><body>' + fixentities(content) + '</body></html>', parser=HTMLParser())
         clean_tree(tree)
     else:
         try:
-            tree = ET.fromstring("<html>" + fixentities(content) + "</html>")
+            tree = ET.fromstring("<html><body>" + fixentities(content) + "</body></html>")
         except ET.XMLSyntaxError, e:
             raise InvalidHtml("HTML content is not well formed.")
     return tree
     The dictionary has keys which are the ids of sections,
     and values which are lists of CSS classes or special commands.
     """
+    #import pdb
+    #pdb.set_trace()
     layout_strategy = get_layout_details_strategy()
     html = layout_strategy.format_pre_parse_hacks(html, styleinfo)
-    root = parse(html)
+    root = parse(html, clean=True)
     root = layout_strategy.format_post_parse_hacks(root, styleinfo)
     structure = get_structure(root, assert_structure=True)
     structure = layout_strategy.format_structure_hacks(structure, styleinfo)
     sect_dict = dict((si.node, si) for si in structure)
 
     # Build Layout
-    for node in root.getchildren():
+    children = root.getchildren()
+    if children and children[0].tag == 'body':
+        children = children[0].getchildren()
+
+    for node in children:
         si = sect_dict.get(node)
 
         if si:
                                  dict(max=max_cols, name=sect.name))
 
 def _render_layout(layout, layout_strategy):
-    root = ET.fromstring("<html></html>")
+    docroot = ET.fromstring("<html><body></body></html>")
+    root = docroot.getchildren()[0] # body
     for row in layout.rows:
         # Row
         logical_column_count = _layout_column_count(row)
 
             logical_column_num += _layout_column_width(col)
         root.append(rowdiv)
-    return root
+    return docroot
 
 def preview_html(html, pres):
     root, structure = format_html(html, pres, return_tree=True)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.