Commits

Michael Elsdörfer committed 8c922ba

Support stripping tags that contain colons, and other non-alphabetic characters.

Comments (0)

Files changed (1)

 
 
 re_strip_tags = re.compile(
-    r"<\/?(\w+)((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>")
+    r"<\/?([^ >]+?)((\s+[^=>]+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>")
 
 def smart_strip_tags(text):
     """Return the given HTML with all tags stripped.
     br, div, p etc.
 
     It also uses an improved regular expression (it can handle '>'
-    inside attributes) from:
+    inside attributes), based on:
     http://kev.coolcavemen.com/2007/03/ultimate-regular-expression-for-html-tag-parsing-with-php/
 
     # TODO: could this be more solid by using HTMLParser (see comment
     for now, btw):
     >>> smart_strip_tags('abc<img \\nalt=">"\\n>def')
     u'abcdef'
+
+    Both tag and attribute names may contain non-alphabetic characters,
+    like a colon (used as a namespace prefix, in XML or MSWord exports).
+    >>> smart_strip_tags('abc<m:lMargin m:val="0" />def')
+    u'abcdef'
     """
 
     def repl(m):