Source

wdmmgext / wdmmgext / load / departments.py

Full commit
# Script to import departmental spending data
import csv
from datetime import date
import glob
import os
import re
import sys
import datapkg
from django.template.defaultfilters import slugify
from pylons import config
import util
import wdmmg.model as model

dataset_name = u'departments'
dataset_long_name = u'UK central government department spending over 25,000' 
dataset_currency = u'gbp'
dataset_notes = u'UK central government department spending over 25,000'
names_root = util.Node('')

def load_file(filepath, department, subunit, commit_every=None):
    '''
    Loads a file into a dataset with name 'departments'.
    - filepath - name of file to load  
    - department - which department's spending it is
    - subunit - sub-unit within that department
    - commit_every - if not None, call session.model.commit() at the 
        specified frequency.
    '''

    # Semaphore to prevent the data being loaded twice.
    # We check by filename, not dataset as in other loaders.
    filename = unicode(filepath.split("/")[-1])
    key_filename = util.get_or_create_key(name=u'filename', notes=u'''\
         Name of spending data file''')
    filename_exists = (model.Session.query(model.EnumerationValue)
            .filter_by(name=filename, key=key_filename)
            ).first()
    if filename_exists:
        print 'File %s already loaded' % filename
        return 1

    # Create the dataset or retrieve it, as appropriate.
    dataset_ = util.get_or_create_dataset(dataset_name, dataset_long_name,\
             dataset_currency, dataset_notes)
    # The keys used to classify departmental spending 
    key_from = util.get_or_create_key(name=u'from', notes=u'''\
The entity that the money was paid from.''')
    key_to = util.get_or_create_key(name=u'to', notes=u'''\
The entity that the money was paid to.''')
    key_time = util.get_or_create_key(name=u'time', notes=u'''\
The accounting period in which the spending happened.''')
    sub_unit = util.get_or_create_key(name=u'sub_unit', notes=u'''\
Department sub-unit''')
    department_family = util.get_or_create_key(name=u'department_family', notes=u'''\
Departmental family''')
    entity = util.get_or_create_key(name=u'entity', notes=u'''\
Entity''')
    expense_type = util.get_or_create_key(name=u'expense_type', notes=u'''\
Expense type''')
    expense_area = util.get_or_create_key(name=u'expense_area', notes=u'''\
Expense area''')
    transaction_number = util.get_or_create_key(name=u'transaction_number', notes=u'''\
Departmental transaction number''')

    # Add parent keys.
    key_parent = model.Session.query(model.Key).filter_by(name=u'parent').first()
    if not key_parent:
        key_parent = model.Key(name=u'parent', notes=u'Means "is part of".')
        model.Session.add(key_parent)
    expense_area.keyvalues[key_parent] = entity.name
    model.Session.commit()

    # Create key_from, which is the department.
    department = util.get_or_create_value(key_from, \
                  unicode(slugify(department)), department)
    department_code = department.code

    # Open the CSV workbook
    items = []
    reader = csv.reader(open(filepath, "rU"))
    try:
        header = reader.next()
    except csv.Error, e:
        reader = csv.reader(open(filepath, "rU"), dialect='excel')
        header = reader.next()
    print header
    while not [word for word in header if 'amount' in word.lower()]:
        header = reader.next()
    header = [h.lower().strip() for h in header]
    print header
    for row_index, row in enumerate(reader):
         if commit_every and row_index%commit_every == 0:
             print "Committing before processing row %d" % row_index
             model.Session.commit()
         row = [unicode(r.decode("mac_roman").strip()) for r in row]
         if not row or len(row) < 8: # some spreadsheets have short blank rows
             print 'ROW TOO SHORT: %s' % row
             continue
         # Don't assume that ordering or wording is standard. Clean up first.
         header = [h.replace("_", " ") for h in header]
         try:
             dept_family_index = [ i for i, word in enumerate(header) \
                            if 'family' in word][0]
             entity_index = [ i for i, word in enumerate(header) \
                            if 'entity' in word][0]
             date_index = [ i for i, word in enumerate(header) \
                            if 'date' in word][0]
             expense_type_index = [ i for i, word in enumerate(header) \
                            if 'expense type' in word][0]
             expense_area_index = [ i for i, word in enumerate(header) \
                            if 'expense area' in word][0]
             supplier_index = [ i for i, word in enumerate(header) \
                            if 'supplier' in word][0]
             transaction_index = [ i for i, word in enumerate(header) \
                            if 'transaction' or 'transation' in word][0]
             amount_index = [ i for i, word in enumerate(header) \
                            if 'amount' in word][0]
         except ValueError as e:
             print e, 'Header not found in %s' % filename
             break
         department_family_value = row[dept_family_index]
         entity_value = row[entity_index]
         date = row[date_index]
         expense_type_value = row[expense_type_index]
         expense_area_value = row[expense_area_index]
         supplier_value = row[supplier_index]
         # Make EnumValue for supplier.
         supplier = util.get_or_create_value(
            key=key_to, code=unicode(slugify(supplier_value)),
            name=supplier_value,
            )
         transaction_number_value = row[transaction_index]
         amount = util.to_float(row[amount_index].replace(u'\u00A3',''))
         # if len(row)>8 and 'DECC' not in filename:
         #     assert not row[8], 'File %s has more than 8 columns \
         #          - investigate!' % filename
         # Make the Entry and its ClassificationItems.
         txn = model.Entry(dataset_=dataset_, amount=amount)
         items = {
             key_from: department.code,
             key_to: supplier.code,
             key_time: date, 
         }
         #print 'making transaction'
         for key, code in items.items():
             model.Session.add(model.ClassificationItem(
                 entry=txn,
                 value=util.get_or_create_value(key, code)
              ))
         items = {
             department_family: department_family_value,
             expense_type: expense_type_value,
             transaction_number: transaction_number_value,
             entity: entity_value,
             expense_area: expense_area_value,
             key_filename: filename, 
             sub_unit: subunit          
         }
         for key, name in items.items():
             model.Session.add(model.ClassificationItem(
                 entry=txn,
                 value=util.get_or_create_value(key, unicode(slugify(name)), name)
              ))
    if commit_every:
        model.Session.commit()

dept_dictionary = { 
'A': 'Administration',
'AGO': 'Attorney General\'s Office',
'BIS': 'Department for Business, Innovation and Science',
'CLG': 'Department for Communities and Local Government',
'CO': 'Cabinet Office',
'COI': 'Central Office of Information',
'DCMS': 'Department for Culture, Media and Sport',
'DECC': 'Department for Energy & Climate Change',
'DEFRA': 'Department of the Environment, Food & Rural Affairs',
'DFE': 'Department for Education',
'DFT': 'Department for Transport',
'DH': 'Department of Health',
'DWP': 'Department for Work and Pensions',
'DfID': 'Department for International Development',
'GEO': 'Government Equalities Office',
'HMRC': 'HM Revenue & Customs',
'HMT': 'HM Treasury',
'HO': 'Home Office',
'HSE': 'Health & Safety Executive',
'MOD': 'Ministry of Defence',
'MoJ': 'Ministry of Justice',
'NOMS': 'NOMS',
'NSG': 'National School of Government',
'OAG': 'Office of the Advocate-General',
'UKTI-A': 'UK Trade & Industry - Administration',
'UKTI-P': 'UK Trade & Industry - Programme',
'Probation': 'Probation Trusts',
'SO': 'Scotland Office',
'TSol': 'Treasury Solicitor\'s Department',
'WO': 'Wales Office',
}

def get_department(filename):
    parts = filename.split("-")
    parts = parts[2:] # Strip prefix.
    try: 
        department = dept_dictionary[parts[0]]
        dept_subcode = parts[1]
        if dept_subcode.isdigit():
            subunit = "Central department"
        else:
            # Handle UKTI's various subunits as a special case.
            if dept_subcode=="UKTI":
                dept_subcode = parts[1] + "-" + parts[2]
            subunit = dept_dictionary[dept_subcode]
    except: 
        raise RuntimeError('Filename %s not mapped in departmental names' % filename)
    return department, subunit   

def load():
    '''
    Loads each file in the departmental data in turn. 
    Access via the Paste Script comment "paster load departments" 
    '''
    path = os.path.join(config['getdata_cache'], 'departments')
    for spending_file in glob.glob(os.path.join(path, '*.csv') ):
        print "Looking at file %s" % spending_file
        spending_file = 'Spend-Transactions-SO-OAG-06-Jun-2010.csv'
        department, subunit = get_department(spending_file)
        filepath = os.path.join(path, spending_file)
        if not ('BIS-UKTI-A-' in spending_file): # Weird formatting.
             #if not ('SO-OAG-' in spending_file): # 'NULL byte' errors - broken file?
             load_file(unicode(filepath), unicode(department), unicode(subunit), \
                 commit_every=1000)
    model.Session.commit()
    model.Session.remove()