tex2sws / tex2sws.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#! /usr/bin/env python

################################################################################
#            Copyright 2010 Robert A. Beezer <beezer@ups.edu>
#
#  Distributed under the terms of the GNU General Public License (GPL),
#  version 2 or any later version.  The full text of the GPL is available at:
#
#                     http://www.gnu.org/licenses/
################################################################################

class TeXtoSWS(object):
    r"""
    Convert the output of a tex4ht run into Sage worksheet(s).

    One HTML file becomes a single ``sws`` worksheet; several HTML
    files become a ``zip`` archive holding one worksheet per file.
    The only public entry point is :meth:`convert`.
    """

    # Sectioning codes tex4ht may insert between the project name and
    # the file number (li=list, ch=chapter, pa=part, se=section,
    # blank=main file).  The order here dictates the order in which
    # files are processed.
    _SECTION_ORDER = ('li', 'ch', 'pa', 'se', '')

    def __init__(self, input_dir=None, project=None, output_file=None):
        r"""
        Configure the working environment for a conversion,
        providing sensible defaults for parameters not given.

        INPUT:

        - ``input_dir`` - default: current working directory -
          a directory that contains all of the
          output from a run of tex4ht on a latex file.  This
          directory should contain the associated graphics
          files with the same relative pathnames as used with
          the original latex sources.

        - ``project`` - default: prefix of lone CSS file -
          a string that describes the base
          for the filenames created by tex4ht.  For example,
          if the original tex file is ``foo.tex`` then the
          basename is ``foo`` and all of the tex4ht output
          files have names beginning with this string.
          Default is determined by searching ``input_dir``
          for the existence of exactly one CSS file
          and the project name is derived from that.

        - ``output_file`` - default: input directory + project name -
          name for output file with either an sws or zip suffix.
          Default is formed as a combination of ``input_dir``
          and ``project`` with either extension ``sws`` or
          ``zip``, depending on if there is just one HTML
          file or several HTML files to be converted.

        OUTPUT:
        Besides the items mentioned above several other
        items are also computed and recorded in this routine.

        First is a list of triples.  The first part of each triple
        is the filename for an HTML file.  The two other parts
        (sectioning code and number, both strings) record the origin
        of each HTML file and are used to sort the files.

        Based on the number of HTML files discovered,
        a ``project_size`` (``'single'`` or ``'multiple'``) is set.

        Raises ``ValueError`` when no project can be inferred, when no
        HTML file belongs to the project, or when ``output_file`` has an
        extension that does not agree with the number of HTML files.
        """
        import os        # getcwd, listdir
        import os.path   # splitext, join
        import re        # to recognize tex4ht filenames

        # Directory with tex4ht output files
        # TODO: Test existence, read access here?
        # Default: current working directory
        if not input_dir:
            input_dir = os.getcwd()
        directory = os.listdir(input_dir)

        # tex4ht builds HTML file(s) and a single CSS file
        # Default: infer project name from a lone CSS file in directory
        if not project:
            cssfiles = [afile for afile in directory
                        if os.path.splitext(afile)[1] == '.css']
            if len(cssfiles) != 1:
                raise ValueError('No project specified and directory %s has no CSS files, or several' % input_dir)
            project = os.path.splitext(cssfiles[0])[0]

        # Discover all html files in the input directory
        #   $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
        #   The project name is escaped and the dot is literal, so that
        #   names such as "foo1xhtml" are not mistaken for HTML files.
        sec = self._SECTION_ORDER
        htmlfile_pattern = re.compile(
            r'^' + re.escape(project) + r'(pa|ch|li|se|)([0-9]*)\.html$')
        files = []
        for afile in directory:
            m = htmlfile_pattern.match(afile)
            if m:
                # Main HTML file does not get a number from tex4ht
                # Fits best as 0 when there are multiple files
                # Since tex4ht starts counting from 1
                files.append((afile, m.group(1), m.group(2) or '0'))
        if not files:
            raise ValueError('No HTML files for project %s found in directory %s' % (project, input_dir))
        # Order by sectioning code (see _SECTION_ORDER), then numerical reversed
        files.sort(key=lambda f: (sec.index(f[1]), -int(f[2])))
        project_size = 'single' if len(files) == 1 else 'multiple'

        #  Coordinate output file extension with nature of input directory
        #  Default: input directory, project name, proper extension based on project size
        if output_file:
            output_extension = os.path.splitext(output_file)[1]
            if output_extension not in ['.sws', '.zip']:
                raise ValueError('output file (%s) must end with sws or zip' % output_file)
            if output_extension == '.sws' and project_size == 'multiple':
                raise ValueError('project has multiple HTML files, but output filename requests a single worksheet')
            if output_extension == '.zip' and project_size == 'single':
                raise ValueError('project has a single HTML file, but output filename requests a zip file of multiple worksheets')
        else:
            output_file = os.path.join(input_dir, project)
            if project_size == 'single':
                output_file = output_file + '.sws'
            else:
                output_file = output_file + '.zip'

        # Record the verified or discovered information for use in the class
        # Long-term may need to support other output formats,
        #   but right now project_size is enough to determine what to do
        self._input_dir = input_dir
        self._project = project
        self._project_size = project_size
        self._output_file = output_file
        self._files = files

    def _parse_tex4ht(self, html_name, linkbase=None):
        r"""
        Bust up tex4ht output into a

        - title - a string
        - graphics - a list of filenames, as referenced in the HTML
        - cells - list of pairs ('plain'|'compute', <contents>)
          where contents are XHTML, or un-delimited Sage code

        INPUT:

        - ``html_name`` - path of the XHTML file to read.

        - ``linkbase`` - default: ``None`` - when given, links of the
          form ``<linkbase><section><number>.html...`` are rewritten
          in place as ``../<number>/...``.

        Graphics references (``object``/``data`` and ``img``/``src``)
        are rewritten in the parsed document to ``data/<basename>``.
        """
        import xml.dom.minidom as dom
        import re     # regular expressions for parsing
        import os

        #  Using verbatim environments for Sage code
        #  allows some XML escape codes to slip through.
        #  <,> are two obvious ones and easy to handle.
        #  The XML escape character, &, is trickier.
        #  We only protect against breaking character
        #  codes like &#1234;  but not codes like &lt;.
        #
        #  Ampersands that don't begin a character code
        ampersand_pattern = re.compile(br'&(?!#[0-9]*;)')

        # The file is handled as bytes so that the XML parser, not the
        # platform default, decides how to decode it.
        sage_block = False
        xmlcontent = []
        with open(html_name, 'rb') as html_file:
            for aline in html_file:
                if sage_block:
                    if b'</sage>' in aline:
                        sage_block = False
                    else:
                        # Ampersands first, so the character codes
                        # introduced below are not escaped again
                        aline = ampersand_pattern.sub(b'&#38;', aline)
                        aline = aline.replace(b'<', b'&#60;')
                        aline = aline.replace(b'>', b'&#62;')
                elif b'<sage>' in aline:
                    # A block opened and closed on the same line must
                    # not leave the escaping state switched on
                    sage_block = b'</sage>' not in aline.split(b'<sage>')[-1]
                xmlcontent.append(aline)

        # Can now parse valid XHTML
        tree = dom.parseString(b''.join(xmlcontent))

        # Find a title (all of them really)
        # Grabs the heading including diacritics (if any).  A heading
        # with nested markup (for example from \LaTeX in the title) has
        # children without text data; such a heading is discarded as a
        # whole so the <title> element below is used instead.
        titles = []
        for e in tree.getElementsByTagName('h2'):
            if e.getAttribute('class') == 'titleHead':
                try:
                    titles.extend([node.data for node in e.childNodes])
                except AttributeError:
                    pass
        if not titles:
            for e in tree.getElementsByTagName('title'):
                for node in e.childNodes:
                    if node.nodeType == dom.Node.TEXT_NODE:
                        titles.append(node.data)
        if not titles:
            titles = ['']

        # Find SVG graphics from pgf/tikz placed by tex4ht (object/data),
        # then "regular" graphicx \includegraphics (img/src).
        # Record the original reference, then point the document at the
        # worksheet's data directory (always a URL, so forward slash).
        graphics = []
        for tag, attribute in (('object', 'data'), ('img', 'src')):
            for e in tree.getElementsByTagName(tag):
                if e.hasAttribute(attribute):
                    target = e.getAttribute(attribute)
                    graphics.append(target)
                    e.setAttribute(attribute, 'data/' + os.path.basename(target))

        # Find and modify links in place
        # TODO: this is old, may be seriously broken
        #       waiting for cross-worksheet link support
        if linkbase:
            link_pattern = re.compile(
                r'^' + re.escape(linkbase) + r'(pa|ch|li|se|)([0-9]*)\.html(.*)$')
            for e in tree.getElementsByTagName('a'):
                if e.hasAttribute('href'):
                    m = link_pattern.match(e.getAttribute('href'))
                    if m:
                        # Handle '' as 0 worksheet
                        ws_number = m.group(2) or '0'
                        e.setAttribute('href', '../' + ws_number + '/' + m.group(3))

        # Ignore headers/footers by starting with body tag
        # Move nested <sage>...</sage> tags to <body>...</body>
        # Collect text between compute cells
        # Identify text cells with <sage>,</sage> tag
        # as produced by custom configuration file for tex4ht
        thebody = tree.getElementsByTagName('body')[0]
        # Reverse document order: each nested cell is re-inserted right
        # after its top-level ancestor, so going backwards keeps several
        # cells under one ancestor in their original order.
        for e in reversed(tree.getElementsByTagName('sage')):
            if e.parentNode.tagName != 'body':
                f = e
                while f.parentNode.tagName != 'body':
                    f = f.parentNode
                f.parentNode.insertBefore(e, f.nextSibling)

        cells = []
        content = []
        for e in thebody.childNodes:
            if e.nodeType != dom.Node.ELEMENT_NODE:
                continue
            tag = e.tagName
            if tag == 'sage':
                cells.append(('plain', ''.join(content)))
                content = []
                # Assume <sage>, </sage> block has at most one child
                # AND text is 7-bit ASCII at this point
                code = e.firstChild
                cells.append(('compute', code.data if code is not None else ''))
            elif tag not in ('script', 'noscript'):
                content.append(e.toxml())
        if content:
            cells.append(('plain', ''.join(content)))
        return titles[0], graphics, cells

    @staticmethod
    def _add_payload(archive, payload, arcname):
        r"""
        Store the byte string ``payload`` in the open tar ``archive``
        under the member name ``arcname``, via a short-lived temp file.
        """
        import os
        import tempfile

        fd, path = tempfile.mkstemp()
        try:
            # Close (and so flush) the file before the archive reads it
            with os.fdopen(fd, 'wb') as handle:
                handle.write(payload)
            archive.add(path, arcname)
        finally:
            os.unlink(path)

    def _create_single_sws(self, html_name, sws_name):
        r"""
        Creates a single Sage worksheet in sws format from a single HTML file.

        INPUT:

        - ``html_name`` - just a filename, no path, assumed to be in
          the input directory.

        - ``sws_name`` - fully-qualified filename of the worksheet
          to write.

        This routine creates a worksheet "from scratch" using just Python
        and none of the notebook code.  This makes for quicker startup times
        and the ability to run without Sage present.

        OUTPUT:  This routine creates a Sage worksheet (a bzip2-compressed
        tar file) in the file named ``sws_name``.  The return value
        is ``None``.
        """
        import time  # for last change in pickled worksheet info
        import tarfile
        import os
        import os.path
        try:
            import cPickle as pickle   # Python 2
        except ImportError:
            import pickle              # Python 3

        input_dir = self._input_dir
        css_name = self._project + '.css'
        html_name = os.path.join(input_dir, html_name)

        # Break out tex4ht output
        # We are not converting any cross-worksheet links,
        #   so we don't pass a base for the linking URLs
        title, graphics, cells = self._parse_tex4ht(html_name, None)

        # Piece back together in worksheet format
        # "content" will be the text representation of a worksheet
        # We first link in the CSS information from the data directory
        content = [r'<link type="text/css" rel="stylesheet" href="data/' + css_name + r'" />']

        # Recognize cells, adorn compute cells
        for kind, text in cells:
            if kind == 'plain':
                content.append(text)
            elif kind == 'compute':
                content.append('{{{' + text + '}}}')

        # Make a generic worksheet configuration as a Python dictionary
        basic = {
            'name': title,
            'system': 'sage',
            'owner': 'admin',
            'last_change': ('admin', time.time()),
            }

        # Worksheet text, new and old styles
        body = ''.join(content).encode('ascii', 'xmlcharrefreplace')
        #  For older versions of notebook, backward compatible
        #  Just have two extra lines of info in header
        header = ''.join([title, '\n', 'system:', basic['system'], '\n'])
        header = header.encode('ascii', 'xmlcharrefreplace')

        # Build sws as a tar file, with expected naming conventions
        prefix = 'sage_worksheet'
        dataprefix = os.path.join(prefix, 'data')
        T = tarfile.open(sws_name, 'w:bz2')
        try:
            # Pickled configuration file; protocol 0 is what cPickle
            # used by default, so the file stays readable by old notebooks
            self._add_payload(T, pickle.dumps(basic, 0),
                              os.path.join(prefix, 'worksheet_conf.pickle'))
            self._add_payload(T, body, os.path.join(prefix, 'worksheet.html'))
            self._add_payload(T, header + body, os.path.join(prefix, 'worksheet.txt'))

            # Data files, graphics, css, whatever
            # The CSS file sits in the data directory with the graphics;
            # a file referenced several times is archived only once.
            added = set()
            for f in graphics + [css_name]:
                if f in added:
                    continue
                added.add(f)
                T.add(os.path.join(input_dir, f),
                      os.path.join(dataprefix, os.path.basename(f)))
        finally:
            T.close()
        return None

    def _create_zip_archive(self):
        r"""
        Package (related) worksheets into one zip archive for easy upload into a notebook.

        OUTPUT:  A zip archive of each HTML file in the input directory, in sws format.
        The notebook allows for uploading all these at once when packaged this way.
        The file name is determined when the converter is initialized.
        """
        from tempfile import mkdtemp
        import os.path
        import shutil
        import zipfile as zf

        # Individual worksheets are staged in a scratch directory,
        # which is removed again whether or not the conversion succeeds
        td = mkdtemp()
        try:
            archive = zf.ZipFile(self._output_file, 'w')
            try:
                for html_name, _, _ in self._files:
                    sws_name = os.path.splitext(html_name)[0] + '.sws'
                    sws_file = os.path.join(td, sws_name)
                    print("Converting:  %s" % html_name)
                    self._create_single_sws(html_name, sws_file)
                    archive.write(sws_file, sws_name)
            finally:
                archive.close()
        finally:
            shutil.rmtree(td, ignore_errors=True)

    def convert(self):
        r"""
        The one public method.

        Writes ``self._output_file``: a single worksheet when the project
        has one HTML file, a zip archive of worksheets otherwise.
        """
        if self._project_size == 'single':
            # Use the file actually discovered; it need not be
            # named <project>.html
            self._create_single_sws(self._files[0][0], self._output_file)
        elif self._project_size == 'multiple':
            self._create_zip_archive()


############################
# Main
############################
#
# Parse command line
# Create converter class
# Call convert()

from optparse import OptionParser
optparse = OptionParser(usage = r"""usage: %prog [options]""")
optparse.add_option("-v", "--verbose",
                    action = "store_true", dest = "verbose", default = False,
                    help = "print progress messages")

optparse.add_option("-i", "--input_directory",
                    action = "store", dest = "input_dir",
                    help = "input directory with tex4ht output")

optparse.add_option("-p", "--project",
                    action = "store", dest = "project",
                    help = "project name, eg foo.tex has project name 'foo'")

optparse.add_option("-o", "--output_file",
                    action = "store", dest = "output_file",
                    help = "output filename (.sws or .zip)")
opts, args = optparse.parse_args()

# Build a converter.  This must happen before the verbose message below,
# which reports the project, directory and output file the converter
# settled on (including any defaults it inferred).
t2s = TeXtoSWS(input_dir = opts.input_dir,
               project = opts.project,
               output_file = opts.output_file)

# Perhaps say what is about to happen
if opts.verbose:
    print("Job: Converting %s project in %s to %s." % (t2s._project, t2s._input_dir, t2s._output_file))

# Do the conversion
t2s.convert()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.