Commits

Luke Plant committed 0bc550b

Switched 'extract_structure' to use HTMLParser and to tolerate bad input

Comments (0)

Files changed (2)

semanticeditor/tests.py

 
 class TestExtractStructure(TestCase):
     def test_extract_structure(self):
-        self.assertEqual([(s.level, s.sect_id, s.name, s.tag) for s in extract_structure("""
+        self.assertEqual([(s.level, s.sect_id, s.name, s.tag) for s in extract_structure(u"""
 <h1>Heading <b>with </b><i>embedded <em>stuff</em> in</i> it</h1> Hmm
 <p>A long paragraph with some actual content</p>
 <h2>A sub heading</h2>
          ])
 
     def test_extract_structure_missing(self):
-        self.assertEqual(extract_structure("Hello"), [])
-
-    def test_rejects_bad_html(self):
-        self.assertRaises(InvalidHtml, extract_structure, "<h1>Foo")
+        self.assertEqual(extract_structure(""), [])
 
     def test_rejects_higher_headings_later(self):
         """

semanticeditor/utils/presentation.py

             # the first to appear in the document.
             # It is also adjusted so that nested items (e.g. p in blockquote)
             # appear to be nested.
-            nesting_level = get_depth(root, n) - 1
+            nesting_level = get_depth(root, n) - 2
             retval.append(StructureItem(level=nesting_level + level - first_heading_level + 1,
                                         sect_id=sect_id,
                                         name=name,
     returns a list of tuples containing (level, name, tag)
     """
     # Parse
-    tree = parse(content)
+    tree = parse(content, clean=True)
     structure = get_structure(tree, assert_structure=True)
     return structure
 
     """
     return sum(_layout_column_width(c) for c in row.columns)
 
+def is_root(node):
+    return node.tag == 'html' or node.tag == 'body'
+
 def _find_layout_commands(root, structure, styleinfo):
     # Layout commands are not stored against normal sections,
     # but have their own entry in the section list, using an id
             sect = sect_dict.get(real_sect_id)
             if sect is not None:
                 parent = get_parent(root, sect.node)
-                if parent is not root:
+                if not is_root(parent):
                     raise BadStructure("Section \"%(name)s\" is not at the top level of "
                                        "the document, and therefore cannot have a column "
                                        "structure applied to it.  Please move the 'New row' "
             sect = sect_dict.get(real_sect_id)
             if sect is not None:
                 parent = get_parent(root, sect.node)
-                if parent is not root:
+                if not is_root(parent):
                     raise BadStructure("Section \"%(name)s\" is not at the top level of "
                                        "the document, and therefore cannot have a column "
                                        "structure applied to it.  Please move the 'New column' "
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate, and press Return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.