rbeezer avatar rbeezer committed e142009

Parsing with xml.dom.minidom

Comments (0)

Files changed (1)

 #! /usr/bin/env sage
-
+# Use sage to pick up notebook library
 
 class TeXtoSWS(object):
 
-    # Could compile some regular expressions 
-    # here as class variables 
     def __init__(self, input_dir=None ):
+        r"""
+        Discover as much as possible about files that were
+        output by tex4ht to a directory.
+
+        INPUT:
+
+        - ``input_dir`` - a directory that contains all of the
+        output from a run of tex4ht on a latex file.  This
+        directory should contain the associated graphics
+        files but we locate them later in the HTML sources.
+
+        OUTPUT:
+
+        Several items are set here.  One is a ``basename`` which
+        tex4ht will have derived from the original LaTeX source
+        file.  So if we begin with ``foo.tex`` all of the
+        files involved will begin with ``foo``.
+
+        Another item set is a list of pairs.  The
+        second part of each pair is the filename for an
+        HTML file.  The first part of the pair is the
+        (relative) worksheet number for that file, with
+        counting starting at zero, and as a string (not
+        an integer). The pairs are sorted according to
+        the numerical value of this first string.
+
+        The directory where all these files live is recorded
+        as ``self._input_dir``, the basename is recorded as
+        ``self._basename``, and based on the number of HTML
+        files discovered a ``_likely_format`` is set.
+        """
+        from os import listdir  # to inspect directory
+        import re               # to massage filenames
+
         if not input_dir:
             input_dir = './'
-        #############
-        from os import listdir  # to inspect directory
-        import re               # to massage filenames
-        print type(input_dir), "  ", input_dir
         directory = listdir(input_dir)
 
         # tex4ht builds HTML files and a CSS file
         basename = cssfilename[:-4]
 
         # Find all html files
-        htmlfile_pattern = re.compile( basename + r'(li|)([0-9]*)(.html)' )
+        # $ matches end-of-string, avoids backups with tildes (Robert Marik)
+        htmlfile_pattern = re.compile( r'^' + basename + r'(li|)([0-9]*)(.html)$' )
         files = []
         for afile in directory:
             m = htmlfile_pattern.match(afile)
             if m:
-                number_string = m.group(2)
+                ws_number = m.group(2)
                 # Main HTML file does not get a number from tex4ht
                 # Fits best as worksheet 0
-                if number_string == '':
-                    number_string = '0'
-                files.append((number_string, afile))
+                if not ws_number:
+                    ws_number = '0'
+                files.append((ws_number, afile))
         files.sort(key=lambda f: int(f[0]))
-        ##############
-        ##############_basename, _files = self._explore_directory(input_dir)
-        self._basename = basename
-        self._files = files
-        # Could compile regular expresssions for _basename here
-        self._input_dir = input_dir
-        # self._likely_format = None
-        if len(self._files) == 1:
+        if len(files) == 1:
             self._likely_format = 'sws'
         else:
             self._likely_format = 'tar'
+        self._input_dir = input_dir
+        self._basename = basename
+        self._files = files
 
 
-    def _explore_directory(self, dir):
+    def _parse_tex4ht(self, html_name, linkbase=None):
         r"""
-        Discovers all files that were output by tex4ht to a directory.
+        Bust up tex4ht output into a
+
+        - title - a string
+        - graphics - a list of filenames
+        - cells - list of pairs ('plain'|'compute', <contents>)
+          where contents are XHTML, or un-delimited Sage code
+        """
+        import xml.dom.minidom as dom
+
+        import re     # regular expressions for parsing
+
+        tree = dom.parse(html_name)
+
+        # Find a title (all of them really)
+        titles = []
+        for e in tree.getElementsByTagName('title'):
+            for text in e.childNodes:
+                titles.append(text.data)
+
+        # Find SVG graphics from pgf/tikz placed by tex4ht
+        graphics = []
+        for e in tree.getElementsByTagName('object'):
+            if e.hasAttribute('data'):
+                graphics.append(e.getAttribute('data'))
+
+        # Mirror above to grab "regular" graphicx \includegraphics
+        for e in tree.getElementsByTagName('img'):
+            if e.hasAttribute('src'):
+                graphics.append(e.getAttribute('src'))
+
+        # Find and modify links in place
+        if linkbase:
+            link_pattern = re.compile( r'^'+linkbase+'(li|)([0-9]*)(.html)(.*)$' )
+            for e in tree.getElementsByTagName('a'):
+                attr = e.attributes
+                if e.hasAttribute('href'):
+                    url = e.getAttribute('href')
+                    m = link_pattern.match(url)
+                    if m:
+                        # Handle '' as 0 worksheet
+                        ws_number = m.group(2)
+                        if not ws_number:
+                            ws_number ='0'
+                        newlink = '../' + ws_number + '/' + m.group(4)
+                        # Change it here
+                        e.setAttribute('href', newlink)
+
+        # Ignore headers/footers by starting with body tag
+        # Collect text between compute cells
+        # Identify compute cells by their <sage>,</sage> tag
+        # as produced by custom configuration file for tex4ht
+        bodies = tree.getElementsByTagName('body')
+        thebody = bodies[0]
+        cells = []
+        content = []
+        for e in thebody.childNodes:
+            if e.nodeType == dom.Node.ELEMENT_NODE:
+                tag = e.tagName
+                if not(tag in ['script', 'noscript', 'sage']):
+                    content.append(e.toxml())
+                if tag == 'sage':
+                    cells.append(('plain', ''.join(content)))
+                    content=[]
+                    compute = e.toxml()[6:-7]  # strip off <sage>,</sage>
+                    cells.append(('compute', compute))
+        if content:
+            cells.append(('plain', ''.join(content)))
+        return titles[0], graphics, cells
+
+
+    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
+        r"""
+        Create a single worksheet from a parsed tex4ht XHTML file.
 
         INPUT:
 
-        - ``dir`` - a directory that contains all of the
-        output from a run of tex4ht on a latex file.  This
-        directory should contain the associated graphics
-        files but we locate them later in the HTML sources.
-
-        OUTPUT:
-
-        Output is two items.  First is a ``basename`` which
-        tex4ht will have derived from the original LaTeX source
-        file.  So if we begin with ``foo.tex`` all of the
-        files involved will begin with ``foo``.
-
-        The second item returned is a list of pairs.  The
-        second part of each pair is the filename for an
-        HTML file.  The first part of the pair is the
-        (relative) worksheet number for that file, with
-        counting starting at zero, and as a string (not
-        an integer). The pairs are sorted according to
-        the numerical value of this first string.
-        """
-        from os import listdir  # to inspect directory
-        import re               # to massage filenames
-
-        directory = listdir(dir)
-
-        # tex4ht builds a HTML files and a CSS file
-        # Infer basename of project from single CSS file in directory
-        # Use this to find all HTML files
-        cssfiles = [afile for afile in directory if afile.endswith('.css')]
-        if len(cssfiles) != 1:
-            raise ValueError('no CSS file, or multiple CSS files in directory')
-        cssfilename = cssfiles[0]
-        basename = cssfilename[:-4]
-
-        # Find all html files
-        htmlfile_pattern = re.compile( basename + r'(li|)([0-9]*)(.html)' )
-        files = []
-        for afile in directory:
-            m = htmlfile_pattern.match(afile)
-            if m:
-                number_string = m.group(2)
-                # Main HTML file does not get a number from tex4ht
-                # Fits best as worksheet 0
-                if number_string == '':
-                    number_string = '0'
-                files.append((number_string, afile))
-        files.sort(key=lambda f: int(f[0]))
-        if basename+'.html' != files[0][1]:
-            raise ValueError('HTML and CSS file have different base names')
-        return basename, files
-
-
-    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
-        r"""
-        Create a single worksheet from an HTML file.
-
-        INPUT:
-
-        - html_name - file name of HTML fiile
+        - html_name - file name of HTML file
         - css_name - an associated CSS file
         - nb - a notebook to host worksheet creation temporarily
         - user - the user directory for worksheets in this notebook
 
         Returns a worksheet in ``nb``.
         """
-        import re     # regular expressions for parsing
-        import shutil # file copy()
+        import shutil # file copy() to data directory
         from sagenb.notebook.notebook import Notebook
 
-        #  Compiled patterns for key indicators in tex4ht output
-        #  Maybe this should be done just once as class-level objects
-        #    as part of a "Converter" object
-        #
-        #  Pull Sage worksheet title line out of HTML header title
-        title_pattern = re.compile(r'(.*)<title>(.*)</title>(.*)')
-        #  Start by trashing most of header, look for end location
-        header_end_pattern = re.compile( r'</noscript>' )
-        #  Recognize when content ends, javascript begins
-        footer_start_pattern = re.compile( r'.*<script type="text/javascript" >.*' )
-        #  Recognize when sage cell begins
-        sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
-        #  Recognize when sage cell begins
-        sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
-        # Identify SVG graphics files being included
-        # e.g.  <object data="graphics-test-1.svg" width="
-        svg_graphics_pattern = re.compile( r'(.*)<object data="(.*).svg" width="(.*)' )
-        # Identify other graphics files being included
-        # e.g.  src="CSRST.png" alt="PIC"
-        other_graphics_pattern = re.compile( r'(.*)src="(.*)" alt="PIC"(.*)' )
-        # Recognize numbered links
-        ##  href="fcla-jsmath-latestli81.html#archetype.I">
-        if linkbase:
-            link_pattern = re.compile( r'(.*)(href="'+linkbase+')(li|)([0-9]*)(.html)(.*)' )
+        title, graphics, cells = self._parse_tex4ht(html_name, linkbase)
 
-        # Cell counter for numbered "id" tags
-        cellnumber = -1
+        # Link in CSS file as part of HTML version
+        # Add to filename list for data directory
+        content=[]
+        content.append( r'<link type="text/css" rel="stylesheet" href="' + css_name + r'" />' )
+        graphics.append(css_name)
 
-        # Accumulate content of worksheet file in a list
-        # Accumlate graphics filenames in a list
-        content=[]
-        graphics=[]
-
-        # May never find a title, so this is default
-        title = ''
-
-        # State variables
-        doing_header = True
-        doing_body = False
-        doing_footer = False
-        doing_sage = False
-
-        # Link in CSS file, gets saved into worksheet data directory later
-        content.append( r'<link type="text/css" rel="stylesheet" href="'+css_name+r'" />' )
-
-        # Lots of this is old and can be re-written better
-        # especially with MatchObject's and groups
-        htmlfile = open(html_name, 'r')
-        for aline in htmlfile.readlines():
-            # In reverse order, so state changes don't fall into wrong locations
-            if doing_footer:
-                pass  # just drop all lines of footer
-            elif doing_body:
-                # inspect, collect graphics filenames first
-                if re.match( svg_graphics_pattern, aline ):
-                    base_name = re.sub( svg_graphics_pattern, r'\2', aline)
-                    graphics.append( base_name[:-1] + '.svg' )
-                if re.match( other_graphics_pattern, aline ):
-                    base_name = re.sub( other_graphics_pattern, r'\2', aline)
-                    graphics.append( base_name[:-1] )
-                # massage links, presumes at most one per line
-                # handle top-level worksheet as worksheet zero
-                if linkbase:
-                    m = re.match( link_pattern, aline)
-                    if m:
-                        if m.group(4) == '': ws_number = '0'
-                        else: ws_number = m.group(4)
-                        aline = m.group(1) + 'href="../' + ws_number + m.group(6) + '\n'
-                if re.match( footer_start_pattern, aline ):
-                    doing_body = False
-                    doing_footer = True
-                elif re.match( sage_start_pattern, aline ):
-                    doing_sage = True
-                    cellnumber += 1
-                    front = re.sub(sage_start_pattern, r'\1', aline)
-                    back = re.sub(sage_start_pattern, r'\2', aline)
-                    content.append(front + "{{{id=" + str(cellnumber) + "|" + back)
-                elif re.match( sage_end_pattern, aline ):
-                    doing_sage = False
-                    front = re.sub(sage_end_pattern, r'\1', aline)
-                    back = re.sub(sage_end_pattern, r'\2', aline)
-                    content.append(front[:-1] + "}}}\n" + back)
-                else:
-                    # have a plain content line
-                    content.append(aline)
-            elif doing_header:
-                if re.match(title_pattern, aline):
-                    # Grab the title, which tex4ht is pretty good about finding
-                    # Seems to have an extra newline?
-                    title = re.sub(title_pattern, r'\2', aline)[:-1]
-                if re.match(header_end_pattern, aline):
-                    doing_header = False
-                    doing_body = True
-        htmlfile.close( )
+        # Recognize cells, adorn compute cells
+        for c in cells:
+            if c[0] == 'plain':
+                content.append(c[1])
+            if c[0] == 'compute':
+                content.append('{{{' + c[1] + '}}}')
 
         # Build a worksheet in nb, and return it
         #   Set title
-        #   Place CSS in data directory
+        #   Place files in data directory
         #   Pack discovered graphics into data directory
         #   Data directory does not exist initially
         #   Side-effect of query is to build it
         W = nb.create_new_worksheet(title, user)
-        W.edit_save(''.join(content))
         datadir = W.data_directory()
-        shutil.copy("./"+css_name, datadir)
         for filename in graphics:
             shutil.copy("./"+filename, datadir)
+        W.edit_save(''.join(content).encode('ascii', 'xmlcharrefreplace'))
         nb.save_worksheet(W)
         return W
 
 
         INPUT:
 
-        - `basename` - a string this is the basename of the original
+        - `basename` - a string. This is the basename of the original
         LaTeX input file and the basename of the tex4ht output.
         So, for example, suppose your original file is foo.tex, and
         when processed by tex4ht it produces an HTML/jsMath file called
         """
         # We make a temporary notebook to work in
         # This is located in $HOME/.sage/temp/hostname/pid/
-        # Temporary directory gets deleted automatically (as process end?)
+        # Temporary directory gets deleted automatically (as process ends?)
         from sage.misc.misc import tmp_dir
         from sagenb.notebook.notebook import Notebook
         nbdir = tmp_dir() + 'converter.sagenb'
 
 
     def _create_tar_archive(self, basename):
-        # this is all ad-hoc
+        # this is all ad-hoc for testing
         # long-term the notebook might be temporary, or not
         # One approach would be a portable container, like sws
         # Other would be to install directly in a user's notebook
         nbdir = "/tmp/fcla.sagenb"
         nb=Notebook(nbdir)
         nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
-        cssfilename = self._basename + '.css'
+        cssfilename = basename + '.css'
         for _,htmlfilename in self._files:
             print "Converting: ", htmlfilename
             self._convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
         T.add('linear')
         T.close()
 
-    def convert(self, format = None, dir = None):
+    def convert(self, dir = None, format = None):
         r"""
         The one public method.
         """
             self._create_single_sws(self._basename)
         if format == 'tar':
             self._create_tar_archive(self._basename)
-
+        # testing parsing, not permanent
+        if format == 'xml-test':
+            print self._parse_tex4ht(self._basename+'.html', self._basename)
 
 ############################
 # Main
 
 t2s = TeXtoSWS()
 t2s.convert()
+## t2s.convert(format='xml-test')
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.