rbeezer avatar rbeezer committed 9fabbbc

Refactor to use a converter class

Comments (0)

Files changed (1)

 #! /usr/bin/env sage
 
-def explore_directory(dir):
-    r"""
-    Discovers all files that were output by tex4ht to a directory.
 
-    INPUT:
+class TeXtoSWS(object):
 
-    - ``dir`` - a directory that contains all of the
-      output from a run of tex4ht on a latex file.  This
-      directory should contain the associated graphics
-      files but we locate them later in the HTML sources.
+    # Could compile some regular expressions 
+    # here as class variables 
+    def __init__(self, input_dir=None ):
+        if not input_dir:
+            input_dir = './'
+        #############
+        from os import listdir  # to inspect directory
+        import re               # to massage filenames
+        print type(input_dir), "  ", input_dir
+        directory = listdir(input_dir)
 
-    OUTPUT:
+        # tex4ht builds one or more HTML files and a single CSS file
+        # Infer basename of project from single CSS file in directory
+        # Use this to find all HTML files
+        cssfiles = [afile for afile in directory if afile.endswith('.css')]
+        if len(cssfiles) != 1:
+            raise ValueError('no CSS file, or multiple CSS files in directory')
+        cssfilename = cssfiles[0]
+        basename = cssfilename[:-4]
 
-    Output is two items.  First is a ``basename`` which
-    tex4ht will have derived from the original LaTeX source
-    file.  So if we begin with ``foo.tex`` all of the
-    files involved will begin with ``foo``.
+        # Find all html files
+        htmlfile_pattern = re.compile( basename + r'(li|)([0-9]*)(.html)' )
+        files = []
+        for afile in directory:
+            m = htmlfile_pattern.match(afile)
+            if m:
+                number_string = m.group(2)
+                # Main HTML file does not get a number from tex4ht
+                # Fits best as worksheet 0
+                if number_string == '':
+                    number_string = '0'
+                files.append((number_string, afile))
+        files.sort(key=lambda f: int(f[0]))
+        ##############
+        ##############_basename, _files = self._explore_directory(input_dir)
+        self._basename = basename
+        self._files = files
+        # Could compile regular expressions for _basename here
+        self._input_dir = input_dir
+        # self._likely_format = None
+        if len(self._files) == 1:
+            self._likely_format = 'sws'
+        else:
+            self._likely_format = 'tar'
 
-    The second item returned is a list of pairs.  The
-    second part of each pair is the filename for an
-    HTML file.  The first part of the pair is the
-    (relative) worksheet number for that file, with
-    counting starting at zero, and as a string (not
-    an integer). The pairs are sorted according to
-    the numerical value of this first string.
-    """
-    from os import listdir  # to inspect directory
-    import re               # to massage filenames
 
-    directory = listdir(dir)
+    def _explore_directory(self, dir):
+        r"""
+        Discovers all files that were output by tex4ht to a directory.
 
-    # tex4ht builds a HTML files and a CSS file
-    # Infer basename of project from single CSS file in directory
-    # Use this to find all HTML files
-    cssfiles = [afile for afile in directory if afile.endswith('.css')]
-    if len(cssfiles) != 1:
-        raise ValueError('no CSS file, or multiple CSS files in directory')
-    cssfilename = cssfiles[0]
-    basename = cssfilename[:-4]
+        INPUT:
 
-    # Find all html files
-    htmlfile_pattern = re.compile( basename + r'(li|)([0-9]*)(.html)' )
-    files = []
-    for afile in directory:
-        m = htmlfile_pattern.match(afile)
-        if m:
-            number_string = m.group(2)
-            # Main HTML file does not get a number from tex4ht
-            # Fits best as worksheet 0
-            if number_string == '':
-                number_string = '0'
-            files.append((number_string, afile))
-    files.sort(key=lambda f: int(f[0]))
-    return basename, files
+        - ``dir`` - a directory that contains all of the
+          output from a run of tex4ht on a LaTeX file.  This
+          directory should contain the associated graphics
+          files but we locate them later in the HTML sources.
 
+        OUTPUT:
 
-def convert_one_file(html_name, css_name, nb, user, linkbase=None):
-    r"""
-    Create a single worksheet from an HTML file.
+        Output is two items.  First is a ``basename`` which
+        tex4ht will have derived from the original LaTeX source
+        file.  So if we begin with ``foo.tex`` all of the
+        files involved will begin with ``foo``.
 
-    INPUT:
+        The second item returned is a list of pairs.  The
+        second part of each pair is the filename for an
+        HTML file.  The first part of the pair is the
+        (relative) worksheet number for that file, with
+        counting starting at zero, and as a string (not
+        an integer). The pairs are sorted according to
+        the numerical value of this first string.
+        """
+        from os import listdir  # to inspect directory
+        import re               # to massage filenames
 
-    - html_name - file name of HTML fiile
-    - css_name - an associated CSS file
-    - nb - a notebook to host worksheet creation temporarily
-    - user - the user directory for worksheets in this notebook
-    - linkinfo - reserved for linked worksheets
+        directory = listdir(dir)
 
-    OUTPUT:
+        # tex4ht builds one or more HTML files and a single CSS file
+        # Infer basename of project from single CSS file in directory
+        # Use this to find all HTML files
+        cssfiles = [afile for afile in directory if afile.endswith('.css')]
+        if len(cssfiles) != 1:
+            raise ValueError('no CSS file, or multiple CSS files in directory')
+        cssfilename = cssfiles[0]
+        basename = cssfilename[:-4]
 
-    Returns a worksheet in ``nb``.
-    """
-    import re     # regular expressions for parsing
-    import shutil # file copy()
-    from sagenb.notebook.notebook import Notebook
+        # Find all html files
+        htmlfile_pattern = re.compile( basename + r'(li|)([0-9]*)(.html)' )
+        files = []
+        for afile in directory:
+            m = htmlfile_pattern.match(afile)
+            if m:
+                number_string = m.group(2)
+                # Main HTML file does not get a number from tex4ht
+                # Fits best as worksheet 0
+                if number_string == '':
+                    number_string = '0'
+                files.append((number_string, afile))
+        files.sort(key=lambda f: int(f[0]))
+        if basename+'.html' != files[0][1]:
+            raise ValueError('HTML and CSS file have different base names')
+        return basename, files
 
-    #  Compiled patterns for key indicators in tex4ht output
-    #  Maybe this should be done just once as class-level objects
-    #    as part of a "Converter" object
-    #
-    #  Pull Sage worksheet title line out of HTML header title
-    title_pattern = re.compile(r'(.*)<title>(.*)</title>(.*)')
-    #  Start by trashing most of header, look for end location
-    header_end_pattern = re.compile( r'</noscript>' )
-    #  Recognize when content ends, javascript begins
-    footer_start_pattern = re.compile( r'.*<script type="text/javascript" >.*' )
-    #  Recognize when sage cell begins
-    sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
-    #  Recognize when sage cell begins
-    sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
-    # Identify SVG graphics files being included
-    # e.g.  <object data="graphics-test-1.svg" width="
-    svg_graphics_pattern = re.compile( r'(.*)<object data="(.*).svg" width="(.*)' )
-    # Identify other graphics files being included
-    # e.g.  src="CSRST.png" alt="PIC"
-    other_graphics_pattern = re.compile( r'(.*)src="(.*)" alt="PIC"(.*)' )
-    # Recognize numbered links
-    ##  href="fcla-jsmath-latestli81.html#archetype.I">
-    if linkbase:
-        link_pattern = re.compile( r'(.*)(href="'+linkbase+')(li|)([0-9]*)(.html)(.*)' )
 
-    # Cell counter for numbered "id" tags
-    cellnumber = -1
+    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
+        r"""
+        Create a single worksheet from an HTML file.
 
-    # Accumulate content of worksheet file in a list
-    # Accumlate graphics filenames in a list
-    content=[]
-    graphics=[]
+        INPUT:
 
-    # May never find a title, so this is default
-    title = ''
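+        - ``html_name`` - file name of the HTML file
+        - ``css_name`` - an associated CSS file
+        - ``nb`` - a notebook to host worksheet creation temporarily
+        - ``user`` - the user directory for worksheets in this notebook
+        - ``linkbase`` - (default: ``None``) base name of the tex4ht HTML
+          files; when given, links to those numbered HTML files are
+          rewritten as relative links to the matching worksheet numbers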
 
-    # State variables
-    doing_header = True
-    doing_body = False
-    doing_footer = False
-    doing_sage = False
+        OUTPUT:
 
-    # Link in CSS file, gets saved into worksheet data directory later
-    content.append( r'<link type="text/css" rel="stylesheet" href="'+css_name+r'" />' )
+        Returns a worksheet in ``nb``.
+        """
+        import re     # regular expressions for parsing
+        import shutil # file copy()
+        from sagenb.notebook.notebook import Notebook
 
-    # Lots of this is old and can be re-written better
-    # especially with MatchObject's and groups
-    htmlfile = open(html_name, 'r')
-    for aline in htmlfile.readlines():
-        # In reverse order, so state changes don't fall into wrong locations
-        if doing_footer:
-            pass  # just drop all lines of footer
-        elif doing_body:
-            # inspect, collect graphics filenames first
-            if re.match( svg_graphics_pattern, aline ):
-                base_name = re.sub( svg_graphics_pattern, r'\2', aline)
-                graphics.append( base_name[:-1] + '.svg' )
-            if re.match( other_graphics_pattern, aline ):
-                base_name = re.sub( other_graphics_pattern, r'\2', aline)
-                graphics.append( base_name[:-1] )
-            # massage links, presumes at most one per line
-            # handle top-level worksheet as worksheet zero
-            if linkbase:
-                m = re.match( link_pattern, aline)
-                if m:
-                    if m.group(4) == '': ws_number = '0'
-                    else: ws_number = m.group(4)
-                    aline = m.group(1) + 'href="../' + ws_number + m.group(6) + '\n'
-            if re.match( footer_start_pattern, aline ):
-                doing_body = False
-                doing_footer = True
-            elif re.match( sage_start_pattern, aline ):
-                doing_sage = True
-                cellnumber += 1
-                front = re.sub(sage_start_pattern, r'\1', aline)
-                back = re.sub(sage_start_pattern, r'\2', aline)
-                content.append(front + "{{{id=" + str(cellnumber) + "|" + back)
-            elif re.match( sage_end_pattern, aline ):
-                doing_sage = False
-                front = re.sub(sage_end_pattern, r'\1', aline)
-                back = re.sub(sage_end_pattern, r'\2', aline)
-                content.append(front[:-1] + "}}}\n" + back)
-            else:
-                # have a plain content line
-                content.append(aline)
-        elif doing_header:
-            if re.match(title_pattern, aline):
-                # Grab the title, which tex4ht is pretty good about finding
-                # Seems to have an extra newline?
-                title = re.sub(title_pattern, r'\2', aline)[:-1]
-            if re.match(header_end_pattern, aline):
-                doing_header = False
-                doing_body = True
-    htmlfile.close( )
+        #  Compiled patterns for key indicators in tex4ht output
+        #  Maybe this should be done just once as class-level objects
+        #    as part of a "Converter" object
+        #
+        #  Pull Sage worksheet title line out of HTML header title
+        title_pattern = re.compile(r'(.*)<title>(.*)</title>(.*)')
+        #  Start by trashing most of header, look for end location
+        header_end_pattern = re.compile( r'</noscript>' )
+        #  Recognize when content ends, javascript begins
+        footer_start_pattern = re.compile( r'.*<script type="text/javascript" >.*' )
+        #  Recognize when sage cell begins
+        sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
+        #  Recognize when sage cell ends
+        sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
+        # Identify SVG graphics files being included
+        # e.g.  <object data="graphics-test-1.svg" width="
+        svg_graphics_pattern = re.compile( r'(.*)<object data="(.*).svg" width="(.*)' )
+        # Identify other graphics files being included
+        # e.g.  src="CSRST.png" alt="PIC"
+        other_graphics_pattern = re.compile( r'(.*)src="(.*)" alt="PIC"(.*)' )
+        # Recognize numbered links
+        ##  href="fcla-jsmath-latestli81.html#archetype.I">
+        if linkbase:
+            link_pattern = re.compile( r'(.*)(href="'+linkbase+')(li|)([0-9]*)(.html)(.*)' )
 
-    # Build a worksheet in nb, and return it
-    #   Set title
-    #   Place CSS in data directory
-    #   Pack discovered graphics into data directory
-    #   Data directory does not exist initially
-    #   Side-effect of query is to build it
-    W = nb.create_new_worksheet(title, user)
-    W.edit_save(''.join(content))
-    datadir = W.data_directory()
-    shutil.copy("./"+css_name, datadir)
-    for filename in graphics:
-        shutil.copy("./"+filename, datadir)
-    nb.save_worksheet(W)
-    return W
+        # Cell counter for numbered "id" tags
+        cellnumber = -1
 
+        # Accumulate content of worksheet file in a list
+        # Accumulate graphics filenames in a list
+        content=[]
+        graphics=[]
 
-def create_single_sws( basename ):
-    r"""
-    Creates a single Sage worksheet in a portable sws format from a one-section LaTeX document.
+        # May never find a title, so this is default
+        title = ''
 
-    INPUT:
+        # State variables
+        doing_header = True
+        doing_body = False
+        doing_footer = False
+        doing_sage = False
 
-    - `basename` - a string this is the basename of the original
-      LaTeX input file and the basename of the tex4ht output.
-      So, for example, suppose your original file is foo.tex, and
-      when processed by tex4ht it produces an HTML/jsMath file called
-      foo.html, and an associated CSS file foo.css.  You would provide
-      `foo` as the input sting, and would end up creating ``foo.sws``.
-      So this routine will create a single worksheet faithfully representing
-      the original intent in the LaTeX file and possibly including Sage
-      compute cells. This assumes the necessary files are in the current
-      working directory.
+        # Link in CSS file, gets saved into worksheet data directory later
+        content.append( r'<link type="text/css" rel="stylesheet" href="'+css_name+r'" />' )
 
-    OUTPUT:  This routine creates a file  foo.sws  in the current working directory.
-    The return value is simply this filename as a string.
-    """
-    # We make a temporary notebook to work in
-    # This is located in $HOME/.sage/temp/hostname/pid/
-    # Temporary directory gets deleted automatically (as process end?)
-    from sage.misc.misc import tmp_dir
-    from sagenb.notebook.notebook import Notebook
-    nbdir = tmp_dir() + 'converter.sagenb'
-    nb = Notebook(nbdir)
-    W = convert_one_file(basename+'.html', basename+'.css', nb, 'admin')
-    nb.export_worksheet(W.filename(), basename+'.sws')
-    return basename+'.sws'
+        # Lots of this is old and can be re-written better
+        # especially with MatchObjects and groups
+        htmlfile = open(html_name, 'r')
+        for aline in htmlfile.readlines():
+            # In reverse order, so state changes don't fall into wrong locations
+            if doing_footer:
+                pass  # just drop all lines of footer
+            elif doing_body:
+                # inspect, collect graphics filenames first
+                if re.match( svg_graphics_pattern, aline ):
+                    base_name = re.sub( svg_graphics_pattern, r'\2', aline)
+                    graphics.append( base_name[:-1] + '.svg' )
+                if re.match( other_graphics_pattern, aline ):
+                    base_name = re.sub( other_graphics_pattern, r'\2', aline)
+                    graphics.append( base_name[:-1] )
+                # massage links, presumes at most one per line
+                # handle top-level worksheet as worksheet zero
+                if linkbase:
+                    m = re.match( link_pattern, aline)
+                    if m:
+                        if m.group(4) == '': ws_number = '0'
+                        else: ws_number = m.group(4)
+                        aline = m.group(1) + 'href="../' + ws_number + m.group(6) + '\n'
+                if re.match( footer_start_pattern, aline ):
+                    doing_body = False
+                    doing_footer = True
+                elif re.match( sage_start_pattern, aline ):
+                    doing_sage = True
+                    cellnumber += 1
+                    front = re.sub(sage_start_pattern, r'\1', aline)
+                    back = re.sub(sage_start_pattern, r'\2', aline)
+                    content.append(front + "{{{id=" + str(cellnumber) + "|" + back)
+                elif re.match( sage_end_pattern, aline ):
+                    doing_sage = False
+                    front = re.sub(sage_end_pattern, r'\1', aline)
+                    back = re.sub(sage_end_pattern, r'\2', aline)
+                    content.append(front[:-1] + "}}}\n" + back)
+                else:
+                    # have a plain content line
+                    content.append(aline)
+            elif doing_header:
+                if re.match(title_pattern, aline):
+                    # Grab the title, which tex4ht is pretty good about finding
+                    # Seems to have an extra newline?
+                    title = re.sub(title_pattern, r'\2', aline)[:-1]
+                if re.match(header_end_pattern, aline):
+                    doing_header = False
+                    doing_body = True
+        htmlfile.close( )
+
+        # Build a worksheet in nb, and return it
+        #   Set title
+        #   Place CSS in data directory
+        #   Pack discovered graphics into data directory
+        #   Data directory does not exist initially
+        #   Side-effect of query is to build it
+        W = nb.create_new_worksheet(title, user)
+        W.edit_save(''.join(content))
+        datadir = W.data_directory()
+        shutil.copy("./"+css_name, datadir)
+        for filename in graphics:
+            shutil.copy("./"+filename, datadir)
+        nb.save_worksheet(W)
+        return W
+
+
+    def _create_single_sws(self, basename):
+        r"""
+        Creates a single Sage worksheet in a portable sws format from a one-section LaTeX document.
+
+        INPUT:
+
+        - ``basename`` - a string that is the basename of the original
+          LaTeX input file and the basename of the tex4ht output.
+          So, for example, suppose your original file is foo.tex, and
+          when processed by tex4ht it produces an HTML/jsMath file called
+          foo.html, and an associated CSS file foo.css.  You would provide
+          ``foo`` as the input string, and would end up creating ``foo.sws``.
+          So this routine will create a single worksheet faithfully representing
+          the original intent in the LaTeX file and possibly including Sage
+          compute cells. This assumes the necessary files are in the current
+          working directory.
+
+        OUTPUT:  This routine creates a file  foo.sws  in the current working directory.
+        The return value is simply this filename as a string.
+        """
+        # We make a temporary notebook to work in
+        # This is located in $HOME/.sage/temp/hostname/pid/
+        # Temporary directory gets deleted automatically (when the process ends?)
+        from sage.misc.misc import tmp_dir
+        from sagenb.notebook.notebook import Notebook
+        nbdir = tmp_dir() + 'converter.sagenb'
+        nb = Notebook(nbdir)
+        W = self._convert_one_file(basename+'.html', basename+'.css', nb, 'admin')
+        nb.export_worksheet(W.filename(), basename+'.sws')
+        return basename+'.sws'
+
+
+    def _create_tar_archive(self, basename):
+        # this is all ad-hoc
+        # long-term the notebook might be temporary, or not
+        # One approach would be a portable container, like sws
+        # Other would be to install directly in a user's notebook
+        from sagenb.notebook.notebook import Notebook
+        nbdir = "/tmp/fcla.sagenb"
+        nb=Notebook(nbdir)
+        nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
+        cssfilename = self._basename + '.css'
+        for _,htmlfilename in self._files:
+            print "Converting: ", htmlfilename
+            self._convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
+        nb.save() # for good measure
+        # Bundle up as an archive
+        # Get pathnames right for ez decompression as user
+        import tarfile, os
+        print "Forming tar archive..."
+        T = tarfile.open(basename+'.tar.bz2', 'w:bz2')
+        os.chdir(nbdir+'/home')
+        T.add('linear')
+        T.close()
+
+    def convert(self, format = None, dir = None):
+        r"""
+        The one public method.
+        """
+        if not format:
+            format = self._likely_format
+        if not dir:
+            dir = self._input_dir
+        # pass a directory to _create_single_sws?
+        if format == 'sws':
+            self._create_single_sws(self._basename)
+        if format == 'tar':
+            self._create_tar_archive(self._basename)
+
 
 ############################
 # Main
 ############################
 #
-# Inspect directory,
-# decide to build one worksheet as an sws,
-#   or
-# build a worksheet user-tree as an archive.
-# Latter is not practical long-term.
-#
-basename, files = explore_directory('./')  # generalize this
-if len(files) == 1:
-    if basename+'.html' != files[0][1]:
-        raise ValueError('HTML and CSS file have different base names')
-    create_single_sws( basename )
-else:
-    # this is all ad-hoc
-    # long-term the notebook might be temporary, or not
-    # One approach would be a portable container, like sws
-    # Other would be to install directly in a user's notebook
-    from sagenb.notebook.notebook import Notebook
-    nbdir = "/tmp/fcla.sagenb"
-    nb=Notebook(nbdir)
-    nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
-    cssfilename = basename + '.css'
-    for _,htmlfilename in files:
-        print "Converting: ", htmlfilename
-        convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
-    nb.save() # for good measure
-    # Bundle up as an archive
-    # Get pathnames right for ez decompression as user
-    import tarfile, os
-    print "Forming tar archive..."
-    T = tarfile.open(basename+'.tar.bz2', 'w:bz2')
-    os.chdir(nbdir+'/home')
-    T.add('linear')
-    T.close()
+# Create converter class
+# Call convert()
 
+t2s = TeXtoSWS()
+t2s.convert()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.