Commits

AnnaPS committed 938d8a1

[wdmmg][s] various fixes to departmental loader

  • Participants
  • Parent commits ca08bf1

Comments (0)

Files changed (1)

wdmmgext/load/departments.py

 import os
 import re
 import sys
+import xlrd
 import datapkg
 from django.template.defaultfilters import slugify
 from pylons import config
                   unicode(slugify(department)), department)
     department_code = department.code
 
-    # Open the CSV workbook
     items = []
-    reader = csv.reader(open(filepath, "rU"))
+    is_csv = True
+    # Some 'csv' files are actually Excel, deal with this.
+    # Skip any blank rows at the beginning of fhe file.
     try:
+        reader = csv.reader(open(filepath, "rU"))
         header = reader.next()
+        while not [word for word in header if 'amount' in word.lower()]:
+            header = reader.next()
     except csv.Error, e:
-        reader = csv.reader(open(filepath, "rU"), dialect='excel')
-        header = reader.next()
-    print header
-    while not [word for word in header if 'amount' in word.lower()]:
-        header = reader.next()
+        print 'CSV error, not opening: %s' % e
+        return 1
+        # In case these spreadsheets don't get fixed, dummy loading code.
+        is_csv = False
+        book = xlrd.open_workbook(filename=filepath)
+        sheet = book.sheet_by_index(0)
+        header = [sheet.cell(0,col).value.strip() for col in range(0,sheet.ncols)]
+        header_row = 0
+        while not [word for word in header if 'amount' in word.lower()]:
+            header_row += 1
+            header = [sheet.cell(header_row,col).value.strip() for col in range(0,sheet.ncols)]
+    # Tidy up and print the header.
     header = [h.lower().strip() for h in header]
     print header
     for row_index, row in enumerate(reader):
+    #for row_index in range(header_row+1,sheet.nrows):
          if commit_every and row_index%commit_every == 0:
              print "Committing before processing row %d" % row_index
              model.Session.commit()
+         #row = [sheet.cell(row_index,col).value for col in range(0,sheet.ncols)]
          row = [unicode(r.decode("mac_roman").strip()) for r in row]
-         if not row or len(row) < 8: # some spreadsheets have short blank rows
-             print 'ROW TOO SHORT: %s' % row
+         # Some spreadsheets (Welsh Office) have incomplete rows. Ignore these.
+         if not row or len(row) < 8: 
+             print 'NOT COMMITTING: TOO SHORT: %s' % row
              continue
-         # Don't assume that ordering or wording is standard. Clean up first.
+         # Don't assume that ordering or wording is standard. 
+         # Report any files that are missing columns.
          header = [h.replace("_", " ") for h in header]
          try:
              dept_family_index = [ i for i, word in enumerate(header) \
              amount_index = [ i for i, word in enumerate(header) \
                             if 'amount' in word][0]
          except ValueError as e:
-             print e, 'Header not found in %s' % filename
+             print e, 'Standard header not found in %s' % filename
              break
          department_family_value = row[dept_family_index]
          entity_value = row[entity_index]
 'HSE': 'Health & Safety Executive',
 'MOD': 'Ministry of Defence',
 'MoJ': 'Ministry of Justice',
+'NIO': 'Northern Ireland Office',
 'NOMS': 'NOMS',
 'NSG': 'National School of Government',
 'OAG': 'Office of the Advocate-General',
     path = os.path.join(config['getdata_cache'], 'departments')
     for spending_file in glob.glob(os.path.join(path, '*.csv') ):
         print "Looking at file %s" % spending_file
-        spending_file = 'Spend-Transactions-SO-OAG-06-Jun-2010.csv'
+        #spending_file = 'Spend-Transactions-SO-OAG-06-Jun-2010.csv'
         department, subunit = get_department(spending_file)
         filepath = os.path.join(path, spending_file)
-        if not ('BIS-UKTI-A-' in spending_file): # Weird formatting.
+        #if not ('BIS-UKTI-A-' in spending_file): # Weird formatting.
              #if not ('SO-OAG-' in spending_file): # 'NULL byte' errors - broken file?
-             load_file(unicode(filepath), unicode(department), unicode(subunit), \
+        load_file(unicode(filepath), unicode(department), unicode(subunit), \
                  commit_every=1000)
     model.Session.commit()
     model.Session.remove()