Commits

rbeezer committed 94b55b7

Remove accidental XML syntax from Sage code

Comments (0)

Files changed (1)

           where contents are XHTML, or un-delimited Sage code
         """
         import xml.dom.minidom as dom
-
         import re     # regular expressions for parsing
 
-        tree = dom.parse(html_name)
+        #  Using verbatim environments for Sage code
+        #  allows some XML escape codes to slip through
+        #  <,> are two obvious ones and easy to handle
+        #  The XML escape character, &, is trickier
+        #  We only leave numeric character references
+        #  like &#1234; intact; named entities like &lt;
+        #  are not recognized, so their & gets escaped
+        #
+        #  Recognize when sage cells begin or end
+        sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
+        sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
+        #  Ampersands that don't begin a character code
+        ampersand_pattern = re.compile( r'(&(?!#[0-9]*;))' )
+
+        sage_block = False
+        xmlcontent = []
+        html_file = open(html_name,'r')
+        for aline in html_file.readlines():
+            if sage_block and re.match(sage_end_pattern, aline):
+                sage_block = False
+            elif sage_block:
+                pieces = re.split( ampersand_pattern, aline )
+                if len(pieces)>1:
+                    for i in range(len(pieces)):
+                        if pieces[i] == '&':
+                            pieces[i] = r'&#38;'
+                    aline = ''.join(pieces)
+                aline = aline.replace('<', r'&#60;')
+                aline = aline.replace('>', r'&#62;')
+            elif not(sage_block) and re.match(sage_start_pattern, aline):
+                sage_block = True
+            xmlcontent.append(aline)
+
+        # Can now parse valid XHTML
+        tree = dom.parseString( ''.join(xmlcontent) )
 
         # Find a title (all of them really)
         titles = []