Commits

rbeezer committed 0436dd0

Removed old parse-tex4ht

Comments (0)

Files changed (1)

parse-tex4ht

-#! /usr/bin/env sage
-
-#  Copyright 2009 Robert A. Beezer
-
-#  2008/11/16  Initiated
-#  2009/01/28  Now parse-tex4ht, working with GT primer
-#  2010/01/27  worksheet.html now begins with just a blank line
-#  2010/01/29  Uses notebook code to build worksheet file
-#  2010/01/31  Pull in SVG's to data directory for tikz support
-#  2010/02/04  One-file conversion as function call
-#  2010/02/04  Better identification of included graphics files
-
-#  Command line arguments
-#  1. Base filename for original LaTeX file fed to tex4ht
-#  2. Destination sage worksheet sws file, full path
-
-# Must call tex4ht with private  tex4ht-sage.cfg  config file
-
-import sys  # command line arguments
-from sagenb.notebook.notebook import Notebook
-
-def convert_one_file(html_name, css_name, sws_name, nb, linkinfo=None):
-    r"""
-    Create a single worksheet in *.sws format from an HTML file
-
-    The HTML file is read line by line.  Header lines are dropped
-    (only the title is kept), the CSS file is inlined in a <style>
-    element, <sage>...</sage> markers become {{{id=N| ... }}} cell
-    delimiters, and everything from the footer javascript onward is
-    discarded.  Graphics files named in the body are copied into the
-    worksheet's data directory before the worksheet is exported.
-
-    INPUT:
-
-    - html_name - file name of HTML file
-    - css_name - an associated CSS file
-    - sws_name - file name of output
-    - nb - a notebook to host worksheet creation temporarily
-    - linkinfo - reserved for linked worksheets (currently unused)
-
-    OUTPUT:
-
-    Returns ``None``, as main side-effect is to create a file
-    in *.sws format.
-    """
-    import re     # regular expressions for parsing
-    import shutil # file copy()
-
-    #  Compiled patterns for key indicators in tex4ht output
-    #  Pull Sage worksheet title line out of HTML header title
-    title_pattern = re.compile(r'(.*)<title>(.*)</title>(.*)')
-    #  Start by trashing most of header, look for end location
-    header_end_pattern = re.compile( r'</noscript>' )
-    #  Recognize when content ends, javascript begins
-    footer_start_pattern = re.compile( r'.*<script type="text/javascript" >.*' )
-    #  Recognize when sage cell begins
-    sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
-    #  Recognize when sage cell ends
-    sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
-    # Identify SVG graphics files being included
-    # e.g.  <object data="graphics-test-1.svg" width="
-    svg_graphics_pattern = re.compile( r'(.*)<object data="(.*).svg" width="(.*)' )
-    # Identify other graphics files being included
-    # e.g.  src="CSRST.png" alt="PIC"
-    other_graphics_pattern = re.compile( r'(.*)src="(.*)" alt="PIC"(.*)' )
-    # Recognize numbered links
-    ##  href="fcla-jsmath-latestli81.html#archetype.I">
-    # Groups: 1 = text before the href, 4 = the digits, 6 = rest of the line
-    link_pattern = re.compile( r'(.*)(href="fcla-jsmath-latest)(li|)([0-9]*)(.html)(.*)' )
-
-    # Cell counter for numbered "id" tags
-    # Starts at -1 so the first cell, after the increment, gets id 0
-    cellnumber = -1
-
-    # Accumulate content of worksheet file in a list
-    # Accumulate graphics file names (SVG and other) in a list
-    content=[]
-    graphics=[]
-
-    # State variables
-    # NOTE(review): doing_sage is set below but never read
-    doing_header = True
-    doing_body = False
-    doing_footer = False
-    doing_sage = False
-
-    htmlfile = open(html_name, 'r')
-    for aline in htmlfile.readlines():
-        # In reverse order, so state changes don't fall into wrong locations
-        if doing_footer:
-            pass  # just drop all lines of footer
-        elif doing_body:
-            # inspect, collect graphics filenames first
-            # '.' does not match the newline, so re.sub() leaves the line's
-            # trailing newline after the substituted group; [:-1] strips it
-            # NOTE(review): a final line lacking a newline would instead
-            # lose its last real character
-            if re.match( svg_graphics_pattern, aline ):
-                base_name = re.sub( svg_graphics_pattern, r'\2', aline)
-                graphics.append( base_name[:-1] + '.svg' )
-            if re.match( other_graphics_pattern, aline ):
-                base_name = re.sub( other_graphics_pattern, r'\2', aline)
-                graphics.append( base_name[:-1] )
-            # massage links, presumes at most one per line
-            # Rewritten href is "../" + digits + remainder; the original
-            # prefix, the optional "li" and the ".html" suffix are dropped
-            m = re.match( link_pattern, aline)
-            if m:
-                aline = m.group(1) + 'href="../' + m.group(4) + m.group(6) + '\n'
-            if re.match( footer_start_pattern, aline ):
-                # the line that starts the footer is itself discarded
-                doing_body = False
-                doing_footer = True
-            elif re.match( sage_start_pattern, aline ):
-                doing_sage = True
-                cellnumber += 1
-                # front/back each end with the newline left behind by re.sub()
-                front = re.sub(sage_start_pattern, r'\1', aline)
-                back = re.sub(sage_start_pattern, r'\2', aline)
-                content.append(front + "{{{id=" + str(cellnumber) + "|" + back)
-            elif re.match( sage_end_pattern, aline ):
-                doing_sage = False
-                front = re.sub(sage_end_pattern, r'\1', aline)
-                back = re.sub(sage_end_pattern, r'\2', aline)
-                # drop front's trailing newline so "}}}" follows it directly
-                content.append(front[:-1] + "}}}\n" + back)
-            else:
-                # have a plain content line
-                content.append(aline)
-        elif doing_header:
-            if re.match(title_pattern, aline) != None :
-                # Grab the title, which tex4ht is pretty good about finding
-                # re.sub() keeps the line's trailing newline; [:-1] removes it
-                title = re.sub(title_pattern, r'\2', aline)[:-1]
-            # re.match() anchors at the start of the line, so the closing
-            # noscript tag must be the first thing on its line to end the header
-            if re.match(header_end_pattern, aline) != None :
-                # Pull in the whole CSS file built by tex4ht
-                cssfile = open( css_name, 'r')
-                content.append( "<style>\n" )
-                content += cssfile.readlines()
-                content.append( "</style>\n" )
-                cssfile.close()
-                doing_header = False
-                doing_body = True
-    htmlfile.close( )
-
-    # Build a worksheet in nb, and export it
-    #   Create a worksheet with a title
-    #   Populate worksheet, export and clean up
-    #   Graphics files (SVG and other) collected above get rounded up here as well
-    # NOTE(review): title is bound only if a <title> line was seen in the
-    # header; otherwise the next line fails with an unbound-local error
-    W = nb.create_new_worksheet(title, 'admin')
-    W.edit_save(''.join(content))
-    # Data directory does not exist, side-effect of next call is to build it
-    # Add a dash to texfilename so subnames are not caught
-    # Maybe match on #'s at end too?
-    datadir = W.data_directory()
-    # Graphics file names are resolved relative to the current directory
-    for filename in graphics:
-        shutil.copy("./"+filename, datadir)
-    nb.export_worksheet(W.filename(), sws_name)
-    # End convert_one_file()
-
-
-
-# Experiments with cross-section linking (ignore)
-# href="fcla-jsmath-2.01li23.html#theorem.VSPCV">
-# href="fcla-jsmath-2.01.html">  for "up" links on sections
-# prefix = "fcla-jsmath-2.01"
-# prefix = "link-test" 
-# base_pattern = re.compile(r'href="' + prefix + r'\.html')
-# link_pattern = re.compile(r'href="' + prefix + r'li(\d*)\.html')
-
-
-# tex4ht builds an HTML file and a CSS file,
-# both with the same basename, taken from argument 1
-# Specify full path/file name for eventual product from argument 2
-# These statements are at module level, so they run whenever the file is executed
-texfilebasename = sys.argv[1]
-htmlfilename = texfilebasename + ".html"
-cssfilename = texfilebasename + ".css"
-swsfilename = sys.argv[2]
-
-#  Build temporary notebook, then trash it
-#  The notebook directory is created relative to the current directory
-nbdir = "./temp.sagenb"
-nb=Notebook(nbdir)
-convert_one_file(htmlfilename, cssfilename, swsfilename, nb)
-nb.delete()
-