Commits

AnnaPS committed daeb588

[wdmmg][xs] various improvements for departmental data loading

Comments (0)

Files changed (1)

wdmmgext/load/departments.py

     # Open the CSV workbook
     items = []
     reader = csv.reader(open(filepath, "rU"))
+    # try:
+    #     for row in reader:
+    #         print 'Row read with success!', row
+    # except csv.Error, e:
+    #     sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
     header = reader.next()
+    # Find the headers, stripping off any extra blank rows at start.
+    while not [word for word in header if 'amount' in word.lower()]:
+        header = reader.next()
     header = [h.lower().strip() for h in header]
     print header
     for row_index, row in enumerate(reader):
              print "Committing before processing row %d" % row_index
              model.Session.commit()
          row = [unicode(r.decode("mac_roman").strip()) for r in row]
-         print row_index, row
-         if not row or len(row) > 8: # some spreadsheets have short blank rows
+         if not row or len(row) < 8: # some spreadsheets have short blank rows
+             print 'ROW TOO SHORT: %s' % row
              continue
-         # Don't assume that ordering or wording is standard. 
+         # Don't assume that ordering or wording is standard. Clean up first.
+         header = [h.replace("_", " ") for h in header]
          try:
              dept_family_index = [ i for i, word in enumerate(header) \
                             if 'family' in word][0]
             )
          transaction_number_value = row[transaction_index]
          amount = util.to_float(row[amount_index].replace(u'\u00A3',''))
-         if len(row)>8 and 'DECC' not in filename:
-             assert not row[8], 'File %s has more than 8 columns \
-                  - investigate!' % filename
+         # if len(row)>8 and 'DECC' not in filename:
+         #     assert not row[8], 'File %s has more than 8 columns \
+         #          - investigate!' % filename
          # Make the Entry and its ClassificationItems.
          txn = model.Entry(dataset_=dataset_, amount=amount)
          items = {
              key_to: supplier.code,
              key_time: date, 
          }
+         print 'making transaction'
          for key, code in items.items():
              model.Session.add(model.ClassificationItem(
                  entry=txn,
     path = os.path.join(config['getdata_cache'], 'departments')
     for spending_file in glob.glob(os.path.join(path, '*.csv') ):
         print "Looking at file %s" % spending_file
+        #spending_file = 'Spend-Transactions-WO-06-Jun-2010.csv'
         department, subunit = get_department(spending_file)
         filepath = os.path.join(path, spending_file)
-        if 'BIS-UKTI-A-' not in spending_file: # These are broken, for the moment.
-            # Ask No 10 to fix them. 
-            load_file(unicode(filepath), unicode(department), unicode(subunit), \
+        if not ('BIS-UKTI-A-' in spending_file): # Weird formatting.
+             if not ('SO-OAG-' in spending_file): # 'NULL byte' errors - broken file?
+                load_file(unicode(filepath), unicode(department), unicode(subunit), \
                  commit_every=1000)
     model.Session.commit()
     model.Session.remove()
Tip: Filter by directory path, e.g. type "/media app.js" to search for public/media/app.js.
Tip: Use camelCasing, e.g. type "ProjME" to search for ProjectModifiedEvent.java.
Tip: Filter by extension type, e.g. type "/repo .js" to search for all .js files in the /repo directory.
Tip: Separate your search terms with spaces, e.g. type "/ssh pom.xml" to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.