Commits

Anonymous committed 3cbed0c

Adds digit-col option and correct handling of sparse files which use index attribute.

  • Participants
  • Parent commits b15289e

Comments (0)

Files changed (2)

adu/excelconvert.py

 """
 
 import optparse
+import os
+import re
 import sys
 
 try:
 
 # There are probably other numeric fields
 NUMERIC_ADU_FORMAT = [field for field in ADUFORMAT if field[3] in ('NUMBER',)]
+NS = '{urn:schemas-microsoft-com:office:spreadsheet}'
+CELL_TAG = NS + 'Cell'
+INDEX_ATTRIB = NS + 'Index'
+DATA_TAG = NS + 'Data'
 
 
 def strings_to_ints(adu_record):
             print >> sys.stderr, "%s. Field %s, value %s." % (error, name, adu_record[name])
 
 
-def yield_converted_from_excel_xml(xmlpath, cellmap, extra=None):
+def yield_converted_from_excel_xml(xmlpath, cellmap, extra=None,
+                                   digits_col=None):
+    """Pass a column header as digits_col to strip non-digits from data in
+    that column."""
     # print >> sys.stderr, cellmap, extra
-    ns = '{urn:schemas-microsoft-com:office:spreadsheet}'
+    print >> sys.stderr, "Opening and parsing file."
     doc = X.parse(open(xmlpath))
-    rows = doc.findall('//%sRow' % ns)
-    headerrow = rows[0]
-    celltag = '%sCell/%sData' % (ns, ns)
-    headernames = [x.text for x in headerrow.findall(celltag)]
+    print >> sys.stderr, "Collecting all the rows."
+    rows = doc.findall('//%sRow' % NS)
+    headernames = _get_row_text(rows[0])
     # print >> sys.stderr, headernames
-    for row in rows[1:]:
-        valuedict = dict(zip(headernames,
-                             [x.text for x in row.findall(celltag)]))
-        # print >> sys.stderr, valuedict
-        renamed = dict((k, valuedict.get(v)) for k, v in cellmap.items())
-        # print >> sys.stderr, renamed
-        strings_to_ints(renamed)
-        # print >> sys.stderr, renamed
-        adu = ADURecord()
-        if extra:
-            adu.update(extra)
-        adu.update(renamed)
-        yield adu
+    if digits_col is not None:
+        digits_col_i = _col_name_to_index(headernames, digits_col)
+    print >> sys.stderr, "Converting."
+    for line_num, row in enumerate(rows[1:], 2):
+        try:
+            row_text = _get_row_text(row)
+            missing_end_cols = [''] * (len(headernames) - len(row_text))
+            full_row_text = row_text + missing_end_cols
+            # print >> sys.stderr, full_row_text
+            if digits_col is not None:
+                full_row_text[digits_col_i] = re.sub(r'[^0-9]', '',
+                                                     full_row_text[digits_col_i])
+            # print >> sys.stderr, full_row_text
+            valuedict = dict(zip(headernames, full_row_text))
+            # print >> sys.stderr, valuedict
+            renamed = dict((k, valuedict.get(v)) for k, v in cellmap.items())
+            # print >> sys.stderr, renamed
+            strings_to_ints(renamed)
+            # print >> sys.stderr, renamed
+            adu = ADURecord()
+            if extra:
+                adu.update(extra)
+            adu.update(renamed)
+            # Call to_list() to do validation now instead of waiting to get
+            # errors when writing output. This does mean to_list() is called
+            # twice for each record.
+            adu.to_list()
+            yield adu
+        except Exception, error:
+            print >> sys.stderr, "Error on line %s: %s" % (line_num, error)
+
+
+def _get_row_text(excel_xml_row_elem):
+    """Returns a list of strings. It won't contain empty items for empty cells
+    after the last cell with something in it."""
+    res = []
+    for cell in excel_xml_row_elem.findall(CELL_TAG):
+        # print >> sys.stderr, cell.attrib
+        if INDEX_ATTRIB in cell.attrib:
+            up_to_col = len(res) + 1
+            cell_index = int(cell.attrib[INDEX_ATTRIB])
+            # print >> sys.stderr, ("up to col: %s, cell index: %s" %
+            #                       (up_to_col, cell_index))
+            if cell_index < up_to_col:
+                raise NotImplemented
+            elif cell_index > up_to_col:
+                skip_cols = cell_index - up_to_col
+                # print >> sys.stderr, "Skipping %s columns." % skip_cols
+                res.extend([''] * skip_cols)
+        data = cell.findall(DATA_TAG)[0]
+        res.append(data.text)
+    return res
+
+
+def _col_name_to_index(headers, col_name):
+    for col in enumerate(headers):
+        if col[1] == col_name:
+            return col[0]
+    raise ValueError("Col '%s' not found" % col_name)
 
 
 def main(args):
                       dest='output',
                       default='-',
                       help="where to write the ADU file (default: stdout)")
+    parser.add_option('-d',
+                      '--digit-col',
+                      dest='digit_col',
+                      default=None,
+                      help="original name of column to strip non-digits from")
     opts, args = parser.parse_args(args)
     if not args:
         parser.error('expected an input file')
         output = open(opts.output, 'wb')
     records = []
     for f in args:
-        for record in yield_converted_from_excel_xml(f, cellmap, extra):
+        for record in yield_converted_from_excel_xml(f, cellmap, extra,
+                                                     opts.digit_col):
             records.append(record)
     ADURecord.to_file(records, output)
     output.close()
 
-            
+
 if __name__ == '__main__':
     main(sys.argv[1:])
             print '-------'
         for k, v in sorted(rec.get_summary().iteritems()):
             print "%s: %s" % (k, v)
-    
-
-