Luke Plant avatar Luke Plant committed 4e5cd8a

Implemented parsing of HTML

Comments (0)

Files changed (2)

semanticeditor/tests.py

+# -*- coding: utf-8 -*-
+
+from django.test import TestCase
+from semanticeditor.utils import extract_headings, InvalidHtml, IncorrectHeadings
+
class TestExtract(TestCase):
    # Exercises extract_headings: text extraction from nested markup and
    # the validation errors it raises for bad input.

    def test_extract_headings(self):
        # Headings containing nested inline elements must come back as
        # plain text, paired with their numeric level.
        html = """
<h1>Heading <b>with </b><i>embedded <em>stuff</em> in</i> it</h1> Hmm<p>A paragraph</p>
<h2>A sub heading</h2><p>Another para</p>
<h3>level 3</h3>
<h4>level 4</h4>
<h5>level 5</h5>
<h6>level 6</h6>
<h1>Heading two</h1>
"""
        expected = [
            (1, "Heading with embedded stuff in it"),
            (2, "A sub heading"),
            (3, "level 3"),
            (4, "level 4"),
            (5, "level 5"),
            (6, "level 6"),
            (1, "Heading two"),
        ]
        self.assertEqual(extract_headings(html), expected)

    def test_rejects_bad_html(self):
        # An unclosed tag is not well formed
        unclosed = "<h1>Foo"
        self.assertRaises(InvalidHtml, extract_headings, unclosed)

    def test_rejects_headings_not_start_at_1(self):
        # The first heading has to be an H1
        starts_at_h2 = "<h2>Hello</h2>"
        self.assertRaises(IncorrectHeadings, extract_headings, starts_at_h2)

    def test_rejects_improper_headings(self):
        # Jumping from H1 straight to H3 skips a level
        skips_level = "<h1>Hello</h1><h3>Bad heading</h3>"
        self.assertRaises(IncorrectHeadings, extract_headings, skips_level)

    def test_rejects_duplicate_headings(self):
        # Two headings with the same name are rejected, even at different levels
        repeated_name = "<h1>Hello</h1><h2>Hello</h2>"
        self.assertRaises(IncorrectHeadings, extract_headings, repeated_name)

semanticeditor/utils.py

+"""
+Utilities for manipulating the content provided by the user.
+"""
+
+from elementtree import ElementTree as ET
+from xml.parsers import expat
+
class InvalidHtml(ValueError):
    """
    Raised when the supplied HTML content is not well formed.
    """
+
class IncorrectHeadings(ValueError):
    """
    Raised when the headings in the content are badly ordered or not unique.
    """
+
def extract_headings(content):
    """
    Extracts H1, H2, etc headings, and returns a list of tuples
    containing (level, name)

    content is an HTML fragment.  It is wrapped in a single <html>
    element before being parsed, so it does not need a root element of
    its own.

    Raises InvalidHtml if the content cannot be parsed, and
    IncorrectHeadings if the first heading is not H1, if a heading is
    more than one level deeper than the heading before it, or if two
    headings have the same name.
    """
    try:
        tree = ET.fromstring("<html>" + content + "</html>")
    except expat.ExpatError:
        raise InvalidHtml("HTML content is not well formed.")

    headingdef = ['h1','h2','h3','h4','h5','h6']

    # Parse: the level is the digit in the tag name, the name is the
    # flattened text of the heading element.
    nodes = [n for n in tree.getiterator() if n.tag in headingdef]
    headings = [(int(h.tag[1]), flatten(h)) for h in nodes]

    # Check ordering
    if not headings:
        return headings

    if headings[0][0] > 1:
        raise IncorrectHeadings("First heading must be H1.")

    # A heading may be at most one level deeper than the previous one
    # (it can go back up by any amount), and names must be unique.
    lastnum = 0
    names = set()
    for num, name in headings:
        if num > lastnum + 1:
            # The deepest level allowed here is lastnum + 1, so that is
            # the level reported in the message.
            raise IncorrectHeadings('Heading "%(name)s" is level H%(foundnum)d, but it should be level H%(rightnum)d or less'  % dict(name=name,foundnum=num,rightnum=lastnum + 1))
        lastnum = num
        if name in names:
            raise IncorrectHeadings('There are more than one headings with the name "%s".' % name)
        names.add(name)

    return headings
+
+
def flatten(node):
    """
    Returns all the text found in this node and in its children.
    """
    # The tail is left out for the top level element only; the helper
    # still includes it for every child it recurses into.
    text = flatten_helper(node, include_tail=False)
    return text
+
def flatten_helper(node, include_tail=True):
    """
    Returns the text of node and all of its children as a single string.

    The result is the node's own text, then the flattened text of each
    child (always including the child's tail), then the node's tail if
    include_tail is true.  A missing text or tail (None) is treated as
    an empty string.
    """
    # node.text is None when the element is empty or starts with a child
    # element, so it needs the same fallback as the tail.
    parts = [node.text or '']
    # Iterate the element directly rather than calling getchildren(),
    # which newer ElementTree versions no longer provide.
    for child in node:
        parts.append(flatten_helper(child))
    if include_tail:
        parts.append(node.tail or '')
    return ''.join(parts)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.