rbeezer avatar rbeezer committed 02d1408

Cleaned up, refactored, support zip archive for multipart docs, README may lag

Comments (0)

Files changed (1)

 
 class TeXtoSWS(object):
 
-    def __init__(self, input_dir=None, basename=None, output_file=None):
+    def __init__(self, input_dir=None, project=None, output_file=None):
         r"""
         Configure the working environment for a conversion,
-        making educated guesses when lacking explicit information.
+        providing sensible defaults for parameters not given.
 
         INPUT:
 
-        - ``input_dir`` - a directory that contains all of the
+        - ``input_dir`` - default: current working directory -
+          a directory that contains all of the
           output from a run of tex4ht on a latex file.  This
           directory should contain the associated graphics
-          files but we locate them later in the HTML sources.
-          If input as ``None`` the current directory is used.
+          files with the same relative pathnames as used with
+          the original latex sources.
 
-        - ``basename`` - a string that describes the base
+        - ``project`` - default: prefix of lone CSS file -
+          a string that describes the base
           for the filenames created by tex4ht.  For example,
           if the original tex file is ``foo.tex`` then the
           project name is ``foo`` and all of the tex4ht output
-          files hanve names beginning with this string.
-          If input as ``None`` then the ``input_dir`` is
-          searched for the existence of exactly one CSS file
-          and the basename is derived from that.
+          files have names beginning with this string.
+          Default is determined by searching  ``input_dir``
+          for the existence of exactly one CSS file
+          and the project name is derived from that.
 
-        - ``output_file`` - name for output Sage worksheet.
-          If input as ``None`` then it will be located in the
-          ``input_dir`` using the value of the ``basename``
-          and the extension ``.sws``.
+        - ``output_file`` - default: input directory + project name -
+          name for the output file, with either an sws or zip suffix.
+          The default is formed by combining ``input_dir``
+          and ``project`` with the extension ``sws`` or
+          ``zip``, depending on whether there is just one HTML
+          file or several HTML files to be converted.
 
         OUTPUT:
         Besides the items mentioned above several other
         items are also computed and recorded in this routine.
 
-        First is a list of pairs.  The second part of each pair
-        is the filename for an HTML file.  The first part of the
-        pair is the (relative) worksheet number for that file,
-        with counting starting at zero, and as a string (not
-        an integer).  The pairs are sorted according to the
-        numerical value of this first string.
+        First is a list of triples.  The first part of each triple
+        is the filename for an HTML file.  The other two parts
+        record information about the origin of each HTML file and
+        are used to sort the files into a reasonable order.
 
         Based on the number of HTML files discovered,
-        a ``_likely_format`` is set.
+        a ``project_size`` (single or multiple) is set.
         """
         import os        # getcwd, listdir,
         import os.path   # splitext
         import re        # to massage filenames
 
+        # Directory with tex4ht output files
+        # TODO: Test existence, read access here?
+        # Default: current working directory
         if not input_dir:
             input_dir = os.getcwd()
         directory = os.listdir(input_dir)
 
         # tex4ht builds HTML file(s) and a single CSS file
-        # Infer basename of project from single CSS file in directory
+        # Default: infer project name from existence of a single CSS file in directory
         cssfiles = [afile for afile in directory if os.path.splitext(afile)[1]=='.css']
-        #print 'DIR: ', directory
-        #print 'CSS: ', cssfiles
-        if not basename:
+        if not project:
             if len(cssfiles) != 1:
-                raise ValueError('need exactly one CSS file in %s directory to determine project' % input_dir)
+                raise ValueError('No project specified and directory %s has no CSS files, or several' % input_dir)
             else:
-                basename = cssfiles[0][:-4]
+                project = cssfiles[0][:-4]
 
-        # Use input_dir and basename to form default worksheet filename
-        if not output_file:
-            output_file = os.path.join(input_dir, basename + '.sws')
+        # For multiple sections, we sort the filenames
+        # according to the type of their sections.  This list
+        # dictates the order, from finer to coarser, left to right
+        sec = ('li', 'ch', 'pa', '')
 
-        # Find all html files
-        # $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
-        htmlfile_pattern = re.compile( r'^' + basename + r'(li|)([0-9]*)(.html)$' )
+        # Discover all html files in the input directory
+        #   $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
+        #   pa = part, ch = chapter, li = list, or blank: all are possible from tex4ht
+        htmlfile_pattern = re.compile( r'^' + project + r'(pa|ch|li|)([0-9]*)(.html)$' )
         files = []
+        sec = ('li', 'ch', 'pa', '')
         for afile in directory:
             m = htmlfile_pattern.match(afile)
             if m:
-                ws_number = m.group(2)
+                sectioning = m.group(1)
+                number = m.group(2)
                 # Main HTML file does not get a number from tex4ht
-                # Fits best as worksheet 0 when there are multiple files
-                if not ws_number:
-                    ws_number = '0'
-                files.append((ws_number, afile))
-        files.sort(key=lambda f: int(f[0]))
+                # Fits best as 0 when there are multiple files
+                # Since tex4ht starts counting from 1
+                if not number:
+                    number = '0'
+                files.append((afile, sectioning, number))
+        # low level divisions first, then numerical reversed
+        files.sort( key=lambda f: (sec.index(f[1]), -int(f[2])) )
         if len(files) == 1:
-            self._likely_format = 'sws'
+            project_size = 'single'
         else:
-            # Need a new Sage container format here
-            self._likely_format = 'tar'
+            project_size = 'multiple'
+
+        #  Coordinate output file extension with nature of input directory
+        #  Default: input directory, project name, proper extension based on project size
+        if output_file:
+            output_extension = os.path.splitext(output_file)[1]
+            if not output_extension in ['.sws', '.zip']:
+                raise ValueError( 'output file (%s) must end with sws or zip' % output_file )
+            if output_extension == '.sws' and project_size == 'multiple':
+                raise ValueError('project has multiple HTML files, but output filename requests a single worksheet')
+            if output_extension == '.zip' and project_size == 'single':
+                raise ValueError('project has a single HTML file, but output filename requests a zip file of multiple worksheets')
+        else:
+            output_file = os.path.join(input_dir, project)
+            if project_size == 'single':
+                output_file = output_file + '.sws'
+            if project_size == 'multiple':
+                output_file = output_file + '.zip'
+
+        # Record the verified or discovered information for use in the class
+        # Long-term may need to support other output formats,
+        #   but right now project_size is enough to determine what to do
         self._input_dir = input_dir
-        self._basename = basename
+        self._project = project
+        self._project_size = project_size
         self._output_file = output_file
         self._files = files
 
                 e.setAttribute('src', new_src)
 
         # Find and modify links in place
+        # TODO: this is old, may be seriously broken
+        #       waiting for cross-worksheet link support
         if linkbase:
-            link_pattern = re.compile( r'^' + linkbase + r'(li|)([0-9]*)(.html)(.*)$' )
+            link_pattern = re.compile( r'^' + linkbase + r'(pa|ch|li| )([0-9]*)(.html)(.*)$' )
             for e in tree.getElementsByTagName('a'):
                 attr = e.attributes
                 if e.hasAttribute('href'):
         return titles[0], graphics, cells
 
 
-    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
+    def _create_single_sws(self, html_name, sws_name):
         r"""
-        OBSOLETE
-        MAYBE FILENAMES/PATHS ARE BROKEN IN FULLY GENERAL USE
-        
-        Create a single worksheet from a parsed tex4ht XHTML file.
+        Creates a single Sage worksheet in sws format from a single HTML file.
 
-        INPUT:
-
-        - html_name - file name of HTML file
-        - css_name - an associated CSS file
-        - nb - a notebook to host worksheet creation temporarily
-        - user - the user directory for worksheets in this notebook
-        - linkinfo - reserved for linked worksheets
-
-        OUTPUT:
-
-        Returns a worksheet in ``nb``.
-        """
-        import shutil # file copy() to data directory
-        from sagenb.notebook.notebook import Notebook
-
-        title, graphics, cells = self._parse_tex4ht(html_name, linkbase)
-
-        # Link in CSS file as part of HTML version
-        # Add to filename list for data directory
-        content=[]
-        content.append( r'<link type="text/css" rel="stylesheet" href="' + css_name + r'" />' )
-        graphics.append(css_name)
-
-        # Recognize cells, adorn compute cells
-        for c in cells:
-            if c[0] == 'plain':
-                content.append(c[1])
-            if c[0] == 'compute':
-                content.append('{{{' + c[1] + '}}}')
-
-        # Build a worksheet in nb, and return it
-        #   Set title
-        #   Place files in data directory
-        #   Pack discovered graphics into data directory
-        #   Data directory does not exist initially
-        #   Side-effect of query is to build it
-        W = nb.create_new_worksheet(title, user)
-        datadir = W.data_directory()
-        for filename in graphics:
-            shutil.copy("./"+filename, datadir)
-        W.edit_save(''.join(content).encode('ascii', 'xmlcharrefreplace'))
-        nb.save_worksheet(W)
-        return W
-
-
-    def _old_create_single_sws(self, basename):
-        r"""
-        OBSOLETE
-        MAYBE FILENAMES/PATHS ARE BROKEN IN FULLY GENERAL USE
-
-        # We make a temporary notebook to work in
-        # This is located in $HOME/.sage/temp/hostname/pid/
-        # Temporary directory gets deleted automatically (as process ends?)
-        """
-        from sage.misc.misc import tmp_dir
-        from sagenb.notebook.notebook import Notebook
-        nbdir = tmp_dir() + 'converter.sagenb'
-        nb = Notebook(nbdir)
-        W = self._convert_one_file(basename+'.html', basename+'.css', nb, 'admin')
-        nb.export_worksheet(W.filename(), basename+'.sws')
-        return basename+'.sws'
-
-    def _create_single_sws(self):
-        r"""
-        Creates a single Sage worksheet in a portable sws format from a one-section LaTeX document.
+        # html_name is just filename, no path, assume in input_dir
+        # sws_name is fully-qualified sws filename
 
         This routine creates a worksheet "from scratch" using just Python
         and none of the notebook code.  This makes for quicker startup times
         import os.path
 
         input_dir = self._input_dir
-        basename = self._basename
-        output_file = self._output_file
+        css_name = self._project + '.css'
+        html_name = os.path.join(input_dir, html_name)
 
         # Break out tex4ht output
-        # There should not be any cross-worksheet links,
+        # We are not converting any cross-worksheet links,
         #   so we don't pass a base for the linking URLs
-        html_name = os.path.join(input_dir, basename + '.html')
         title, graphics, cells = self._parse_tex4ht(html_name, None)
 
         # Piece back together in worksheet format
         # We first link in the CSS information from the data directory
         # The CSS file should be in with the graphics files, so doesn't need a path,
         # We add it to the data directory along with all the graphics files
-        css_name = basename + '.css'
         content.append( r'<link type="text/css" rel="stylesheet" href="' + css_name + r'" />' )
         graphics.append(css_name)
 
 
         # Build sws as a tar file, with expected naming conventions
         prefix = 'sage_worksheet'
-        T = tarfile.open(output_file, 'w:bz2')
+        T = tarfile.open(sws_name, 'w:bz2')
 
         # Pickled configuration file
         fd, configfile =  tempfile.mkstemp()
             T.add(os.path.join(input_dir, f), os.path.join(dataprefix, base))
 
         T.close()
-        return output_file
+        return None
 
-    def _create_tar_archive(self, basename):
-        # this is all ad-hoc for testing
-        # long-term the notebook might be temporary, or not
-        # One approach would be a portable container, like sws
-        # Other would be to install directly in a user's notebook
-        from sagenb.notebook.notebook import Notebook
-        nbdir = "/tmp/fcla.sagenb"
-        nb=Notebook(nbdir)
-        nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
-        cssfilename = basename + '.css'
-        for _,htmlfilename in self._files:
-            print "Converting: ", htmlfilename
-            self._convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
-        nb.save() # for good measure
-        # Bundle up as an archive
-        # Get pathnames right for ez decompression as user
-        import tarfile, os
-        print "Forming tar archive..."
-        T = tarfile.open(basename+'.tar.bz2', 'w:bz2')
-        os.chdir(nbdir+'/home')
-        T.add('linear')
-        T.close()
+    def _create_zip_archive(self):
+        r"""
+        Package (related) worksheets into one zip archive for easy upload into a notebook.
 
+        OUTPUT:  A zip archive of each HTML file in the input directory, in sws format.
+        The notebook allows for uploading all these at once when packaged this way.
+        The file name is determined when the converter is initialized.
+        """
+        from tempfile import mkdtemp
+        import os.path
+        import zipfile as zf
 
-    def convert(self, format = None):
+        archive = zf.ZipFile(self._output_file, 'w')
+        td = mkdtemp()
+        for html_name, _, _ in self._files:
+            sws_name = html_name[:-5]+'.sws'
+            sws_file = os.path.join(td, sws_name)
+            print "Converting: ", html_name
+            self._create_single_sws(html_name, sws_file)
+            archive.write(sws_file, sws_name)
+        archive.close()
+
+
+    def convert(self):
         r"""
         The one public method.
         """
-        if not format:
-            format = self._likely_format
-        # pass a directory to _create_single_sws?
-        if format == 'sws':
-            self._create_single_sws()
-        if format == 'tar':
-            self._create_tar_archive(self._basename)
-        ## Calls to testing routines, not permanent
-        if format == 'xml-test':
-            print self._parse_tex4ht(self._basename+'.html', self._basename)
-        if format == 'pure-python':
-            print self._pure_python(self._basename)
+        if self._project_size == 'single':
+            self._create_single_sws(self._project + '.html', self._output_file)
+        if self._project_size == 'multiple':
+            self._create_zip_archive()
+
 
 ############################
 # Main
 ############################
 #
-# Analyze command line
+# Parse command line
 # Create converter class
 # Call convert()
 
                     action = "store", dest = "input_dir",
                     help = "input directory with tex4ht output")
 
-optparse.add_option("-b", "--basename",
-                    action = "store", dest = "basename",
-                    help = "project basename, eg foo.tex has basename 'foo'")
+optparse.add_option("-p", "--project",
+                    action = "store", dest = "project",
+                    help = "project name, eg foo.tex has project name 'foo'")
 
 optparse.add_option("-o", "--output_file",
                     action = "store", dest = "output_file",
-                    help = "filename for Sage worksheet")
+                    help = "output filename (.sws or .zip)")
 opts, args = optparse.parse_args()
 
-# Build a converter
+# Build a converter, perhaps saying so
+if opts.verbose:
+    print "Job: Converting %s project in %s to %s." % (t2s._project, t2s._input_dir, t2s._output_file)
+
 t2s = TeXtoSWS(input_dir = opts.input_dir,
-               basename = opts.basename,
+               project = opts.project,
                output_file = opts.output_file)
-if opts.verbose:
-    print "Job: Converting %s project in %s to %s." % (t2s._basename, t2s._input_dir, t2s._output_file)
 
 # Do the conversion
 t2s.convert()
-## Testing, experimental calls
-## t2s.convert(format='xml-test')
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.