Source

sws2tex / htmltable2latex.py

Full commit
###############################################################################
#   Sage: Open Source Mathematical Software
#       Copyright (C) 2009-2010 Wilfried Huss <huss@finanz.math.tugraz.at>
#       Copyright (C) 2009-2010 Robert Marik <marik@mendelu.cz>
#  Distributed under the terms of the GNU General Public License (GPL),
#  version 2 or any later version.  The full text of the GPL is available at:
#                  http://www.gnu.org/licenses/
###############################################################################

r"""
Convert HTML tables created by the ``html.table()`` command into LaTeX
``tabular`` syntax.
"""

import HTMLParser
import string

class Html_table2Latex_tabular(HTMLParser.HTMLParser):
    """
    HTML parser that rewrites a ``<table>`` as a LaTeX ``tabular``.

    Feed the HTML with ``feed()``; the generated LaTeX accumulates in
    ``self.latex``.  The column specification is written as soon as the
    ``<table>`` tag is seen, so ``columns`` must be known up front.  A
    typical use is two passes: a first instance only to read
    ``maxColumns``, then a second instance constructed with that value.

    Handled tags: ``table``, ``tr`` (classes ``row-a`` / ``row-b`` select
    the row colour), ``th``, ``td``, ``span`` (class ``math`` is typeset
    in display-style math mode) and ``img``.  Other tags are ignored and
    their text content is passed through unchanged.

    Arguments:
        columns -- number of ``l`` columns in the tabular specification
        cell_id -- worksheet cell id used to build image paths
    """

    def __init__(self, columns = 0, cell_id = None):
        HTMLParser.HTMLParser.__init__(self)
        self.columns = columns
        self.latex = ''                # latex tabular code
        self.table_row = []            # cells of the current row
        self.rowCount = 0              # row counter
        self.columnCount = 0           # cells seen in the current row
        self.data = ''                 # text of the current cell
        self.cell_id = cell_id

        # maximal number of columns seen in any row of the table
        self.maxColumns = 0
        self.in_math_mode = False      # inside <span class="math">
        self.row_type = None           # "a", "b", "header" or None

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            self.start_table()
        elif tag == 'tr':
            self.start_tr(attrs)
        elif tag == 'th':
            self.start_th(attrs)
        elif tag == 'td':
            self.start_td(attrs)
        elif tag == 'span':
            self.start_span(attrs)
        elif tag == 'img':
            self.start_img(attrs)

    def handle_endtag(self, tag):
        if tag == 'table':
            self.end_table()
        elif tag == 'tr':
            self.end_tr()
        elif tag == 'th':
            self.end_th()
        elif tag == 'td':
            self.end_td()
        elif tag == 'span':
            self.end_span()
        elif tag == 'img':
            self.end_img()

    def _note_column_count(self):
        # Fold the current row's cell count into the running maximum.
        self.maxColumns = max(self.maxColumns, self.columnCount)

    def start_table(self):
        # Left-aligned columns separated by a white vertical rule.
        separator = '!{\\color{white}\\vline}'
        column_spec = separator.join(['l'] * self.columns)
        self.latex += '\\begin{tabular}{%s}\n' % column_spec

    def end_table(self):
        # Covers a final row whose </tr> was omitted.
        self._note_column_count()
        self.latex += '\\end{tabular}\n'

    def start_img(self, attrs):
        src = dict(attrs).get('src')
        if src is None:
            # No usable src attribute: nothing to include.
            return
        # The first 7 characters of src are dropped -- presumably a
        # "cell://" style prefix; verify against the worksheet HTML.
        self.data = "\\includegraphics{sage_worksheet/cells/%s/%s}" % (self.cell_id, src[7:])

    def end_img(self):
        # <img .../> and </img> reach handle_endtag; there is nothing to close.
        pass

    def start_span(self, attrs):
        # Only math spans open math mode; other spans contribute just
        # their text, so every '$' opened here is closed by end_span.
        if dict(attrs).get('class') != 'math':
            return
        self.in_math_mode = True
        self.data += '$\\displaystyle '
        if self.row_type == "header":
            self.data += '\\mathbf{'

    def end_span(self):
        if not self.in_math_mode:
            return
        if self.row_type == "header":
            self.data += '}' # close mathbf
        self.data += '$' # close mathmode
        self.in_math_mode = False

    def start_tr(self, attrs):
        row_class = dict(attrs).get('class')
        if row_class == "row-a":
            self.row_type = "a"
        elif row_class == "row-b":
            self.row_type = "b"
        else:
            self.row_type = None

        # Covers a previous row whose </tr> was omitted.
        self._note_column_count()
        self.columnCount = 0

    def start_th(self, attrs):
        # A header cell marks the whole row as a header row.
        self.row_type = "header"
        self.data = "\\textcolor{white}{\\textbf{"

    def end_tr(self):
        if self.columnCount > 0:
            if self.rowCount > 0:
                # Terminate the previous row before starting this one.
                self.latex += r"\\[2ex]" + "\n"

            if self.row_type == "a":
                self.latex += '\\rowcolor[HTML]{F8F8F8}\n'
            elif self.row_type == "b":
                self.latex += '\\rowcolor[HTML]{EFEFEF}\n'
            elif self.row_type == "header":
                self.latex += '\\rowcolor[HTML]{A6BA4E}\n'

            self.latex += ' & '.join(self.table_row)
            self.latex += '\n'

            self.table_row = []
        # Count this row now; waiting for the next <tr> would miss the
        # last (or only) row of the table.
        self._note_column_count()
        self.rowCount += 1
        self.row_type = None

    def start_td(self, attrs):
        self.data = ''

    def end_td(self):
        self.table_row.append(self.data)
        self.data = ''
        self.columnCount += 1

    def end_th(self):
        self.data += "}}" # close textcolor and textbf
        self.table_row.append(self.data)
        self.data = ''
        self.columnCount += 1

    def handle_data(self, data):
        self.data += data

def parse_table(text, cell_id = None):
    """
    Convert the HTML table in ``text`` to LaTeX ``tabular`` code.

    The column specification has to be written before any row is parsed,
    so the HTML is parsed twice: once to count the columns and once to
    generate the output.

    Arguments:
        text    -- HTML source containing the table
        cell_id -- worksheet cell id, passed to the parser for image paths

    Returns the generated LaTeX code as a string.
    """
    # First pass: count the number of columns.
    counter = Html_table2Latex_tabular()
    counter.feed(text)

    # columnCount still holds the last row's cell count after feed();
    # include it in case maxColumns was last refreshed before that row
    # ended (otherwise a one-row table would get an empty column spec).
    columns = max(counter.maxColumns, counter.columnCount)

    # Second pass: generate the LaTeX code.
    parser = Html_table2Latex_tabular(columns, cell_id = cell_id)
    parser.feed(text)

    return parser.latex

# LaTeX document header written by the test program below, ahead of the
# generated tabular.  The document is deliberately left open: the caller
# appends "\end{document}" itself.  The generated code uses \rowcolor,
# \textcolor / \color and \includegraphics, hence the colortbl, xcolor
# and graphics packages.
preamble = r"""
\documentclass{article}
\usepackage{xcolor,amsmath,graphics,hyperref}
\usepackage{colortbl}
\usepackage[margin=1in]{geometry}
\usepackage[utf8]{inputenc}
\usepackage{amssymb}

\begin{document}
"""

# Test program: convert the HTML file named on the command line and write
# a standalone LaTeX document to table_test.tex in the current directory.
if __name__ == '__main__':
    import sys
    if len(sys.argv) == 2:
        # Context managers close both files even if an error is raised.
        with open(sys.argv[1], 'r') as in_file:
            text = in_file.read()

        # Convert before opening the output, so a parse failure does not
        # leave a truncated table_test.tex behind.
        latex = parse_table(text)

        with open('table_test.tex', 'w') as f:
            f.write(preamble)
            f.write(latex)
            f.write("\\end{document}")

    else:
        # Single parenthesised argument: prints the same text whether
        # print is a statement or a function.
        print('Usage: %s filename' % sys.argv[0])