Source

tex2sws / tex2sws.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#! /usr/bin/env sage
#
# Use sage to pick up sagenb notebook library
# To fully test experimental pure Python, replace "sage" with "python"

################################################################################
#            Copyright 2010 Robert A. Beezer <beezer@ups.edu>
#
#  Distributed under the terms of the GNU General Public License (GPL),
#  version 2 or any later version.  The full text of the GPL is available at:
#
#                     http://www.gnu.org/licenses/
################################################################################

class TeXtoSWS(object):
    r"""
    Convert the XHTML output of a tex4ht run into Sage worksheets.

    An instance inspects one directory of tex4ht output (see
    ``__init__``) and ``convert()`` then packages what was found,
    either as a single ``sws`` file or as a ``tar.bz2`` archive
    of several worksheets.

    Input files are read from the directory given to ``__init__``;
    output files are written to the current working directory.
    """

    def __init__(self, input_dir=None ):
        r"""
        Discover as much as possible about files that were
        output by tex4ht to a directory.

        INPUT:

        - ``input_dir`` - a directory that contains all of the
        output from a run of tex4ht on a latex file.  This
        directory should contain the associated graphics
        files but we locate them later in the HTML sources.
        Defaults to the current directory, ``'./'``.

        OUTPUT:

        Several attributes are set here.  One is ``_basename`` which
        tex4ht will have derived from the original LaTeX source
        file.  So if we begin with ``foo.tex`` all of the
        files involved will begin with ``foo``.  It is inferred
        from the name of the single CSS file in the directory.

        Another is ``_files``, a list of pairs.  The
        second part of each pair is the filename for an
        HTML file.  The first part of the pair is the
        (relative) worksheet number for that file, with
        counting starting at zero, and as a string (not
        an integer). The pairs are sorted according to
        the numerical value of this first string.

        The directory where all these files live is recorded
        as ``self._input_dir``.  Based on the number of HTML
        files discovered, a ``_likely_format`` is set: ``'sws'``
        for exactly one HTML file, ``'tar'`` otherwise.

        Raises ``ValueError`` unless the directory holds
        exactly one CSS file.
        """
        from os import listdir  # to inspect directory
        import re               # to massage filenames

        if not input_dir:
            input_dir = './'
        directory = listdir(input_dir)

        # tex4ht builds HTML files and a CSS file
        # Infer basename of project from single CSS file in directory
        # Use this to find all HTML files
        cssfiles = [afile for afile in directory if afile.endswith('.css')]
        if len(cssfiles) != 1:
            raise ValueError('no CSS file, or multiple CSS files in directory')
        basename = cssfiles[0][:-4]

        # Find all html files
        # $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
        # The basename is escaped so characters such as '+' or '.' in
        # a project name are matched literally, as is the dot of '.html'
        htmlfile_pattern = re.compile(
            r'^' + re.escape(basename) + r'(li|)([0-9]*)(\.html)$' )
        files = []
        for afile in directory:
            m = htmlfile_pattern.match(afile)
            if m:
                # Main HTML file does not get a number from tex4ht
                # Fits best as worksheet 0 when there are multiple files
                files.append((m.group(2) or '0', afile))
        files.sort(key=lambda f: int(f[0]))
        if len(files) == 1:
            self._likely_format = 'sws'
        else:
            # Need a new Sage container format here
            self._likely_format = 'tar'
        self._input_dir = input_dir
        self._basename = basename
        self._files = files


    def _source_path(self, name):
        r"""
        Return ``name`` located inside the tex4ht output directory.

        File names discovered by ``__init__`` or read out of the HTML
        are relative to ``self._input_dir``, so they are joined to it
        before being opened or copied.
        """
        import os
        return os.path.join(self._input_dir, name)


    def _worksheet_text(self, css_name, cells):
        r"""
        Assemble the text of a worksheet from parsed cells.

        INPUT:

        - ``css_name`` - name of the CSS file, linked at the very top
        - ``cells`` - list of pairs as returned by ``_parse_tex4ht``

        OUTPUT:

        The worksheet source as 7-bit ASCII, with other characters
        replaced by XML character references.  Compute cells are
        wrapped in ``{{{``, ``}}}`` and plain cells are copied as-is.
        """
        content = [ r'<link type="text/css" rel="stylesheet" href="' + css_name + r'" />' ]
        for kind, text in cells:
            if kind == 'plain':
                content.append(text)
            elif kind == 'compute':
                content.append('{{{' + text + '}}}')
        return ''.join(content).encode('ascii', 'xmlcharrefreplace')


    def _parse_tex4ht(self, html_name, linkbase=None):
        r"""
        Bust up tex4ht output into a

        - title - a string
        - graphics - a list of filenames
        - cells - list of pairs ('plain'|'compute', <contents>)
          where contents are XHTML, or un-delimited Sage code

        INPUT:

        - ``html_name`` - path of the XHTML file to open
        - ``linkbase`` - optional basename of the project; when given,
        links of the form ``<linkbase>[li]<number>.html<rest>`` are
        rewritten in place as ``../<number>/<rest>``

        OUTPUT:

        The triple ``(title, graphics, cells)``.  The title is
        the empty string when none can be found.  As a side effect
        on the parsed tree, the ``src`` of every image is reduced
        to its base filename.
        """
        import xml.dom.minidom as dom
        import re     # regular expressions for parsing
        import os

        #  Using verbatim environments for Sage code
        #  allows some XML escape codes to slip through.
        #  <,> are two obvious ones and easy to handle.
        #  The XML escape character, &, is trickier.
        #  We only protect against breaking character
        #  codes like &#1234;  but not codes like &lt;.
        #
        #  The file is handled as raw bytes so that the XML
        #  parser itself honors any encoding declaration.
        #
        #  Recognize when sage cells begin or end
        open_tag = b'<sage>'
        close_tag = b'</sage>'
        #  Ampersands that don't begin a character code
        ampersand_pattern = re.compile( br'(&(?!#[0-9]*;))' )

        sage_block = False
        xmlcontent = []
        with open(html_name, 'rb') as html_file:
            for aline in html_file:
                if sage_block and close_tag in aline:
                    # Cell is over, unless another one opens later on this line
                    sage_block = aline.rfind(open_tag) > aline.rfind(close_tag)
                elif sage_block:
                    aline = ampersand_pattern.sub(b'&#38;', aline)
                    aline = aline.replace(b'<', b'&#60;')
                    aline = aline.replace(b'>', b'&#62;')
                elif open_tag in aline:
                    # A cell opened and closed on one line leaves us outside
                    # a cell; otherwise following markup would get escaped
                    sage_block = aline.rfind(open_tag) > aline.rfind(close_tag)
                xmlcontent.append(aline)

        # Can now parse valid XHTML
        tree = dom.parseString( b''.join(xmlcontent) )

        # Find a title (all of them really)
        titles = []
        try:
            # Grabs the title including diacritics (if any).
            # Fails, if the title contains complicated structure
            # (for example from word \LaTeX in title)
            for e in tree.getElementsByTagName('h2'):
                if e.getAttribute('class') == 'titleHead':
                    for text in e.childNodes:
                        titles.append(text.data)
        except AttributeError:
            # An element node has no ``data``; discard the partial
            # title and fall back on the plain-text <title> below
            titles = []
        if not titles:
            for e in tree.getElementsByTagName('title'):
                for text in e.childNodes:
                    titles.append(text.data)
        if not titles:
            titles = ['']

        # Find SVG graphics from pgf/tikz placed by tex4ht
        graphics = []
        for e in tree.getElementsByTagName('object'):
            if e.hasAttribute('data'):
                graphics.append(e.getAttribute('data'))

        # Mirror above to grab "regular" graphicx \includegraphics
        for e in tree.getElementsByTagName('img'):
            if e.hasAttribute('src'):
                src = e.getAttribute('src')
                graphics.append(src)
                e.setAttribute('src', os.path.basename(src))

        # Find and modify links in place
        if linkbase:
            # linkbase is matched literally, as is the dot of '.html'
            link_pattern = re.compile(
                r'^' + re.escape(linkbase) + r'(li|)([0-9]*)(\.html)(.*)$' )
            for e in tree.getElementsByTagName('a'):
                if e.hasAttribute('href'):
                    m = link_pattern.match(e.getAttribute('href'))
                    if m:
                        # Handle '' as 0 worksheet
                        ws_number = m.group(2) or '0'
                        e.setAttribute('href', '../' + ws_number + '/' + m.group(4))

        # Ignore headers/footers by starting with body tag
        # Move nested <sage>...</sage> tags to <body>...</body>
        # Collect text between compute cells
        # Identify text cells with <sage>,</sage> tag
        # as produced by custom configuration file for tex4ht
        thebody = tree.getElementsByTagName('body')[0]
        # Reverse order, so several cells nested in one
        # element keep their document order once moved
        for e in reversed(tree.getElementsByTagName('sage')):
            if e.parentNode.tagName != 'body':
                f = e
                while f.parentNode.tagName != 'body':
                    f = f.parentNode
                f.parentNode.insertBefore(e, f.nextSibling)
        cells = []
        content = []
        for e in thebody.childNodes:
            if e.nodeType == dom.Node.ELEMENT_NODE:
                tag = e.tagName
                if not(tag in ['script', 'noscript', 'sage']):
                    content.append(e.toxml())
                if tag == 'sage':
                    cells.append(('plain', ''.join(content)))
                    content = []
                    # Assume <sage>, </sage> block has just one child
                    # AND text is 7-bit ASCII at this point
                    # An empty block has no child at all
                    first = e.firstChild
                    cells.append(('compute', first.data if first is not None else ''))
        if content:
            cells.append(('plain', ''.join(content)))
        return titles[0], graphics, cells


    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
        r"""
        Create a single worksheet from a parsed tex4ht XHTML file.

        INPUT:

        - html_name - file name of HTML file, in the input directory
        - css_name - an associated CSS file, in the input directory
        - nb - a notebook to host worksheet creation temporarily
        - user - the user directory for worksheets in this notebook
        - linkbase - basename used to rewrite links between
        worksheets, passed along to ``_parse_tex4ht``

        OUTPUT:

        Returns a worksheet in ``nb``.
        """
        import shutil # file copy() to data directory

        title, graphics, cells = self._parse_tex4ht(
            self._source_path(html_name), linkbase)

        # CSS file is linked in as part of HTML version
        # Add to filename list for data directory
        graphics.append(css_name)

        # Build a worksheet in nb, and return it
        #   Set title
        #   Place files in data directory
        #   Pack discovered graphics into data directory
        #   Data directory does not exist initially
        #   Side-effect of query is to build it
        W = nb.create_new_worksheet(title, user)
        datadir = W.data_directory()
        for filename in graphics:
            shutil.copy(self._source_path(filename), datadir)
        W.edit_save(self._worksheet_text(css_name, cells))
        nb.save_worksheet(W)
        return W


    def _create_single_sws(self, basename):
        r"""
        Creates a single Sage worksheet in a portable sws format from a one-section LaTeX document.

        INPUT:

        - `basename` - a string. This is the basename of the original
        LaTeX input file and the basename of the tex4ht output.
        So, for example, suppose your original file is foo.tex, and
        when processed by tex4ht it produces an HTML/jsMath file called
        foo.html, and an associated CSS file foo.css.  You would provide
        `foo` as the input string, and would end up creating ``foo.sws``.
        So this routine will create a single worksheet faithfully representing
        the original intent in the LaTeX file and possibly including Sage
        compute cells. The necessary files are read from the input
        directory given when this object was created.

        OUTPUT:  This routine creates a file  foo.sws  in the current working directory.
        The return value is simply this filename as a string.
        """
        # We make a temporary notebook to work in
        # This is located in $HOME/.sage/temp/hostname/pid/
        # Temporary directory gets deleted automatically (as process ends?)
        from sage.misc.misc import tmp_dir
        from sagenb.notebook.notebook import Notebook
        nbdir = tmp_dir() + 'converter.sagenb'
        nb = Notebook(nbdir)
        W = self._convert_one_file(basename+'.html', basename+'.css', nb, 'admin')
        sws_name = basename + '.sws'
        nb.export_worksheet(W.filename(), sws_name)
        return sws_name

    def _pure_python(self, basename):
        r"""
        EXPERIMENTAL:
        Build an sws file without any notebook code.
        Assumes just a single file of HTML.
        Edit shebang to just call python, not sage

        INPUT:

        - ``basename`` - a string, ``foo`` for input files
        ``foo.html`` and ``foo.css`` in the input directory

        OUTPUT:

        Creates ``foo.sws`` in the current working directory, a
        bzip2-compressed tar file whose members all live under
        ``sage_worksheet/``.  Returns this filename as a string.
        """
        import time  # for last change in pickled worksheet info
        import tarfile
        import io
        try:
            import cPickle as pickle
        except ImportError:
            import pickle

        css_name = basename + '.css'
        html_name = basename + '.html'

        # Break out tex4ht output
        title, graphics, cells = self._parse_tex4ht(
            self._source_path(html_name), None)

        # Piece back together in worksheet format
        graphics.append(css_name)
        body = self._worksheet_text(css_name, cells)

        # Make a generic worksheet configuration as a Python dictionary
        basic = {
            'name':title,
            'system':'sage',
            'owner':'admin',
            'last_change':('admin', time.time()),
            }

        # Layout inside the archive is fixed, no matter
        # which directory the input files were read from
        prefix = 'sage_worksheet/'

        #  For older versions of notebook, backward compatible
        #  Just have two extra lines of info in header
        header = [title, '\n', 'system:', basic['system'], '\n']
        header = ''.join(header).encode('ascii', 'xmlcharrefreplace')

        # Generated members: pickled configuration file (protocol 0,
        # the Python 2 default), then worksheet files, new and old styles
        members = [
            (prefix + 'worksheet_conf.pickle', pickle.dumps(basic, 0)),
            (prefix + 'worksheet.html', body),
            (prefix + 'worksheet.txt', header + body),
            ]

        # Build sws as a tar file, with expected name
        sws_name = basename + '.sws'
        T = tarfile.open(sws_name, 'w:bz2')
        try:
            # Generated members go straight from memory into the
            # archive, no temporary files to create and clean up
            stamp = int(time.time())
            for name, data in members:
                info = tarfile.TarInfo(name)
                info.size = len(data)
                info.mtime = stamp
                T.addfile(info, io.BytesIO(data))

            # Data files, graphics, css, whatever
            dataprefix = prefix + 'data/'
            for f in graphics:
                T.add(self._source_path(f), dataprefix + f)
        finally:
            T.close()
        return sws_name


    def _create_tar_archive(self, basename):
        r"""
        Convert every discovered HTML file to a worksheet and
        bundle the results as ``<basename>.tar.bz2``.

        INPUT:

        - ``basename`` - a string, the basename of the project

        OUTPUT:

        Worksheets are created for user ``linear`` in a notebook at
        the fixed location ``/tmp/fcla.sagenb``.  That user's
        directory is then archived, as ``linear/``, into a file
        ``<basename>.tar.bz2`` in the current working directory.
        Progress is printed.  Nothing is returned.
        """
        # this is all ad-hoc for testing
        # long-term the notebook might be temporary, or not
        # One approach would be a portable container, like sws
        # Other would be to install directly in a user's notebook
        from sagenb.notebook.notebook import Notebook
        import os
        import tarfile
        nbdir = "/tmp/fcla.sagenb"
        nb = Notebook(nbdir)
        nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
        cssfilename = basename + '.css'
        for _, htmlfilename in self._files:
            print("Converting:  " + htmlfilename)
            self._convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
        nb.save() # for good measure
        # Bundle up as an archive
        # Get pathnames right for ez decompression as user
        # An explicit archive name does this, and the
        # working directory of the process is left alone
        print("Forming tar archive...")
        T = tarfile.open(basename+'.tar.bz2', 'w:bz2')
        try:
            T.add(os.path.join(nbdir, 'home', 'linear'), 'linear')
        finally:
            T.close()

    def convert(self, dir = None, format = None):
        r"""
        The one public method.

        INPUT:

        - ``dir`` - currently unused; files are always read from
        the directory given when this object was created
        - ``format`` - one of ``'sws'``, ``'tar'``, or the testing
        formats ``'xml-test'`` and ``'pure-python'``.  Defaults to
        the format guessed from the number of HTML files found.

        OUTPUT:

        Output files are created by the routine selected; the two
        testing formats also print their results.  Nothing is
        returned.  Raises ``ValueError`` for an unrecognized format.
        """
        if not format:
            format = self._likely_format
        # pass a directory to _create_single_sws?
        if format == 'sws':
            self._create_single_sws(self._basename)
        elif format == 'tar':
            self._create_tar_archive(self._basename)
        ## Calls to testing routines, not permanent
        elif format == 'xml-test':
            print(self._parse_tex4ht(self._source_path(self._basename+'.html'), self._basename))
        elif format == 'pure-python':
            print(self._pure_python(self._basename))
        else:
            raise ValueError('unknown conversion format: %r' % (format,))

############################
# Main
############################
#
# Create converter class
# Call convert()

if __name__ == '__main__':
    # Only convert when run as a script, so that importing this
    # file for its class does not start a conversion as a side effect.
    # Inspects the current directory and uses the format guessed there.
    t2s = TeXtoSWS()
    t2s.convert()
    ## Testing, experimental calls
    ## t2s.convert(format = 'pure-python')
    ## t2s.convert(format='xml-test')