Luke Plant committed 481ea15 (merge)

Merged bug fix

Comments (0)

Files changed (3)

semanticeditor/tests.py

         outh = "<div class=\"row\"><div><div><h1>1</h1><p>para 1</p><h2>2</h2></div></div></div>"
         self.assertEqual(outh, format_html(html, {'h1_1':[NEWROW]}))
 
+    def test_format_pre(self):
+        html = "<pre>This\r\nis\r\na\r\ntest</pre>"
+        # check that format_html doesn't do anything nasty inside the pre
+        html2 = format_html(html, {})
+        pres, html3 = extract_presentation(html2)
+        self.assertEqual(html, html3)
+
 class TestHacks(TestCase):
     def test_div_format_hack(self):
         html = '<p>Test</p>'

semanticeditor/utils/presentation.py

         return tree
 
 ### Parsing ###
-import htmlentitydefs
-def fixentities(htmltext):
-    # replace HTML character entities with numerical references
-    # note: this won't handle CDATA sections properly
-    def repl(m):
-        entity = htmlentitydefs.entitydefs.get(m.group(1).lower())
-        if not entity:
-            return m.group(0)
-        elif len(entity) == 1:
-            if entity in "&<>'\"":
-                return m.group(0)
-            return "&#%d;" % ord(entity)
-        else:
-            return entity
-    return re.sub("&(\w+);?", repl, htmltext)
-
 def parse(content, clean=False):
     """
     Parses the HTML provided into an ElementTree.
     If 'clean' is True, lax parsing is done, the tree is cleaned
     of dirty user provided HTML
     """
+    # We also use HTMLParser for 'strict', because the XML parser seems to eliminate
+    # '\r' for some reason.
+    tree = ET.fromstring(u'<html><body>' + content + u'</body></html>', parser=HTMLParser())
     if clean:
-        tree = ET.fromstring(u'<html><body>' + fixentities(content) + u'</body></html>', parser=HTMLParser())
         clean_tree(tree)
-    else:
-        try:
-            tree = ET.fromstring(u"<html><body>" + fixentities(content) + u"</body></html>")
-        except ET.XMLSyntaxError, e:
-            raise InvalidHtml("HTML content is not well formed.")
     return tree
 
 # NB: ElementTree is bizarre - after parsing some UTF-8 bytestrings, it will
 def _html_extract(root):
     if len(root) == 0 and root.text is None and root.tail is None:
         return ''
-    return ET.tostring(root).replace('<html>','').replace('</html>','').replace('<body>','').replace('</body>', '').replace("<head/>","")
+    return ET.tostring(root).replace('<html>','').replace('</html>','').replace('<body>','').replace('</body>', '').replace("<head/>","").replace("&#13;", "\r")
 
 def _strip_presentation(tree):
     cleanup(tree, lambda t: t.tag == 'div')

semanticeditor/views.py

 @json_view
 def clean_html_view(request):
     html = request.POST.get('html', '')
-    print clean_html(html)
     return graceful_errors(AllUserErrors, lambda: dict(html=clean_html(html)))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.