# Source file: wdmmgext/wdmmgext/load/departments.py
# Script to import departmental spending data
import csv
from datetime import date
import glob
import os
import re
import sys
import xlrd
import datapkg
from django.template.defaultfilters import slugify
from pylons import config
import util
import wdmmg.model as model

# Metadata for the 'departments' dataset that load_file() creates/retrieves.
dataset_name = u'departments'
dataset_long_name = u'UK central government department spending over 25,000' 
dataset_currency = u'gbp'
# NOTE(review): identical to dataset_long_name; presumably 25,000 is the GBP
# publication threshold -- confirm.
dataset_notes = u'UK central government department spending over 25,000'
# NOTE(review): names_root is not referenced anywhere in this file; its
# purpose (if any) lives in code not visible here.
names_root = util.Node('')

def load_file(filepath, department, subunit, commit_every=None):
    '''
    Loads a file into a dataset with name 'departments'.
    - filepath - name of file to load  
    - department - which department's spending it is
    - subunit - sub-unit within that department
    - commit_every - if not None, call session.model.commit() at the 
        specified frequency.

    Returns 1 without loading anything if the file was already loaded or
    cannot be parsed as CSV; otherwise returns None after creating an
    Entry (with its ClassificationItems) for every usable data row.
    '''

    # Semaphore to prevent the data being loaded twice.
    # We check by filename, not dataset as in other loaders.
    filename = unicode(filepath.split("/")[-1])
    key_filename = util.get_or_create_key(name=u'filename', notes=u'''\
         Name of spending data file''')
    # If any EnumerationValue already carries this filename, the file was
    # loaded on a previous run.
    filename_exists = (model.Session.query(model.EnumerationValue)
            .filter_by(name=unicode(filename), key=key_filename)
            ).first()
    if filename_exists:
        print 'File %s already loaded' % filename
        return 1

    # Create the dataset or retrieve it, as appropriate.
    dataset_ = util.get_or_create_dataset(dataset_name, dataset_long_name,\
             dataset_currency, dataset_notes)
    # The keys used to classify departmental spending 
    key_from = util.get_or_create_key(name=u'from', notes=u'''\
The entity that the money was paid from.''')
    key_to = util.get_or_create_key(name=u'to', notes=u'''\
The entity that the money was paid to.''')
    key_time = util.get_or_create_key(name=u'time', notes=u'''\
The accounting period in which the spending happened.''')
    row_id = util.get_or_create_key(name=u'row_id', notes=u'''\
Row number within the file.''')
    sub_unit = util.get_or_create_key(name=u'sub_unit', notes=u'''\
Department sub-unit''')
    department_family = util.get_or_create_key(name=u'department_family', notes=u'''\
Departmental family''')
    entity = util.get_or_create_key(name=u'entity', notes=u'''\
Entity''')
    expense_type = util.get_or_create_key(name=u'expense_type', notes=u'''\
Expense type''')
    expense_area = util.get_or_create_key(name=u'expense_area', notes=u'''\
Expense area''')
    transaction_number = util.get_or_create_key(name=u'transaction_number', notes=u'''\
Departmental transaction number''')

    # Add parent keys.
    key_parent = model.Session.query(model.Key).filter_by(name=u'parent').first()
    if not key_parent:
        key_parent = model.Key(name=u'parent', notes=u'Means "is part of".')
        model.Session.add(key_parent)
    # Record that expense areas nest inside entities.
    expense_area.keyvalues[key_parent] = entity.name
    model.Session.commit()

    # Create key_from, which is the department.
    # NOTE(review): this rebinds the 'department' parameter to the returned
    # EnumerationValue; the original name string is unavailable below.
    department = util.get_or_create_value(key_from, \
                  unicode(slugify(department)), department)
    department_code = department.code  # NOTE(review): appears unused below.

    items = []
    is_csv = True
    # Some 'csv' files are actually Excel, deal with this.
    # Skip any blank rows at the beginning of the file.
    # The real header row is recognised by containing an 'amount' column.
    try:
        reader = csv.reader(open(filepath, "rU"))
        header = reader.next()
        while not [word for word in header if 'amount' in word.lower()]:
            header = reader.next()
    except csv.Error, e:
        print 'CSV error, not opening: %s' % e
        return 1
        # NOTE(review): everything from here to the end of this except
        # clause is unreachable dead code (it follows a 'return') -- a
        # disabled Excel fallback kept for reference.
        # In case these spreadsheets don't get fixed, dummy loading code.
        is_csv = False
        book = xlrd.open_workbook(filename=filepath)
        sheet = book.sheet_by_index(0)
        header = [sheet.cell(0,col).value.strip() for col in range(0,sheet.ncols)]
        header_row = 0
        while not [word for word in header if 'amount' in word.lower()]:
            header_row += 1
            header = [sheet.cell(header_row,col).value.strip() for col in range(0,sheet.ncols)]
    # Tidy up and print the header.
    header = [h.lower() for h in header]
    print header
    for row_index, row in enumerate(reader):
    #for row_index in range(header_row+1,sheet.nrows):
         if commit_every and row_index%commit_every == 0:
             print "Flushing before processing row %d" % row_index
             model.Session.flush()
         #row = [sheet.cell(row_index,col).value for col in range(0,sheet.ncols)]
         # NOTE(review): assumes the CSV bytes are mac_roman-encoded --
         # confirm against the source files.
         row = [unicode(r.decode("mac_roman").strip()) for r in row]
         # Some spreadsheets (Welsh Office) have incomplete rows. Ignore these.
         if not row:
             continue
         # Don't assume that ordering or wording is standard. 
         # Report any files that are missing columns.
         # NOTE(review): this normalisation is loop-invariant and could be
         # hoisted above the loop.
         header = [h.replace("_", " ") for h in header]
         try:
             # Locate each column by keyword rather than fixed position.
             dept_family_index = [ i for i, word in enumerate(header) \
                            if 'family' in word][0]
             entity_index = [ i for i, word in enumerate(header) \
                            if 'entity' in word][0]
             date_index = [ i for i, word in enumerate(header) \
                            if 'date' in word][0]
             expense_type_index = [ i for i, word in enumerate(header) \
                            if 'expense type' in word][0]
             expense_area_index = [ i for i, word in enumerate(header) \
                            if 'expense area' in word][0]
             supplier_index = [ i for i, word in enumerate(header) \
                            if 'supplier' in word][0]
             # 'transation' covers a known misspelling in some files.
             transaction_index = [ i for i, word in enumerate(header) \
                                 if 'transaction' in word \
                                 or 'transation' in word][0]
             amount_index = [ i for i, word in enumerate(header) \
                            if 'amount' in word][0]
         except IndexError as e:
             print 'WARNING! Standard header not found in %s' % filename
             break
         department_family_value = row[dept_family_index]
         entity_value = row[entity_index]
         date = row[date_index]
         expense_type_value = row[expense_type_index]
         expense_area_value = row[expense_area_index]
         supplier_value = row[supplier_index]
         # Make EnumValue for supplier.
         supplier = util.get_or_create_value(
            key=key_to, code=unicode(slugify(supplier_value)),
            name=supplier_value,
            )
         transaction_number_value = row[transaction_index]
         # Some files have semi-blank rows. Check for transaction
         # number and skip if it's not there. 
         if not transaction_number_value:
             continue
         # Get rid of pound signs.
         amount = util.to_float(row[amount_index].replace(u'\u00A3',''))
         # Row ids are 1-based.
         row_id_value = row_index + 1
         # if len(row)>8 and 'DECC' not in filename:
         #     assert not row[8], 'File %s has more than 8 columns \
         #          - investigate!' % filename
         # Make the Entry and its ClassificationItems.
         txn = model.Entry(dataset_=dataset_, amount=amount)
         # These three keys are classified with raw (unslugified) codes.
         items = {
             key_from: unicode(department.code),
             key_to: unicode(supplier.code),
             key_time: unicode(date), 
         }
         for key, code in items.items():
             model.Session.add(model.ClassificationItem(
                 entry=txn,
                 value=util.get_or_create_value(key, code)
              ))
         # Put items that we want to have slugified names in here.
         items = {
             department_family: department_family_value,
             expense_type: expense_type_value,
             transaction_number: transaction_number_value,
             entity: entity_value,
             expense_area: expense_area_value,
             row_id: row_id_value,
             key_filename: filename,
             sub_unit: subunit          
         }
         for key, name in items.items():
             if name: # Don't add blank items.
                 model.Session.add(model.ClassificationItem(
                     entry=txn,
                     value=util.get_or_create_value(key, \
                           unicode(slugify(name)), unicode(name))
                  ))
    # Commit at the end of every file.
    if commit_every:
        model.Session.commit()


# Maps the code embedded in a spending-data filename (e.g. the 'HMT' in
# 'Spend-Transactions-HMT-...') to a human-readable name. The table mixes
# top-level department codes with sub-unit codes ('A', 'Probation',
# 'UKTI-A', ...) because both appear in the same filename position.
dept_dictionary = { 
'A': 'Administration',
'AGO': 'Attorney General\'s Office',
'BIS': 'Department for Business, Innovation and Science',
'CLG': 'Department for Communities and Local Government',
'CO': 'Cabinet Office',
'COI': 'Central Office of Information',
'DCMS': 'Department for Culture, Media and Sport',
'DECC': 'Department for Energy & Climate Change',
'DEFRA': 'Department of the Environment, Food & Rural Affairs',
'DFE': 'Department for Education',
'DFT': 'Department for Transport',
'DH': 'Department of Health',
'DWP': 'Department for Work and Pensions',
'DfID': 'Department for International Development',
'GEO': 'Government Equalities Office',
'HMRC': 'HM Revenue & Customs',
'HMT': 'HM Treasury',
'HO': 'Home Office',
'HSE': 'Health & Safety Executive',
'MOD': 'Ministry of Defence',
'MoJ': 'Ministry of Justice',
'NIO': 'Northern Ireland Office',
'NOMS': 'NOMS',
'NSG': 'National School of Government',
'OAG': 'Office of the Advocate-General',
'UKTI-A': 'UK Trade & Industry - Administration',
'UKTI-P': 'UK Trade & Industry - Programme',
'Probation': 'Probation Trusts',
'SO': 'Scotland Office',
'TSol': 'Treasury Solicitor\'s Department',
'WO': 'Wales Office',
}

def get_department(filename):
    '''
    Work out which department and sub-unit a spending file belongs to.
    - filename - path or bare name of the file; the name is expected to
      look like 'Spend-Transactions-<DEPT>-<SUBUNIT|digits>-...'.
    Returns a (department, subunit) tuple of human-readable names.
    Raises RuntimeError if the codes are not in dept_dictionary or the
    filename does not have enough hyphen-separated fields.
    '''
    # Split only the basename: callers pass full paths, and a hyphen in a
    # parent directory would otherwise shift the code positions.
    parts = os.path.basename(filename).split("-")
    parts = parts[2:] # Strip the 'Spend-Transactions' prefix.
    try:
        department = dept_dictionary[parts[0]]
        dept_subcode = parts[1]
        if dept_subcode.isdigit():
            # A numeric field here means the file covers the department
            # as a whole rather than a named sub-unit.
            subunit = "Central department"
        else:
            # Handle UKTI's various subunits as a special case: their
            # codes span two hyphen-separated fields ('UKTI-A', 'UKTI-P').
            if dept_subcode == "UKTI":
                dept_subcode = parts[1] + "-" + parts[2]
            subunit = dept_dictionary[dept_subcode]
    # Narrowed from a bare except: only a failed lookup or a too-short
    # filename means an unmapped file; anything else should propagate.
    except (KeyError, IndexError):
        raise RuntimeError('Filename %s not mapped in departmental names' % filename)
    return department, subunit

def load():
    '''
    Loads each file in the departmental data in turn. 
    Access via the Paste Script comment "paster load departments" 
    '''
    path = os.path.join(config['getdata_cache'], 'departments')
    for spending_file in glob.glob(os.path.join(path, '*.csv') ):
        print "Looking at file %s" % spending_file
        #spending_file = 'Spend-Transactions-SO-OAG-06-Jun-2010.csv'
        department, subunit = get_department(spending_file)
        filepath = os.path.join(path, spending_file)
        load_file(unicode(filepath), unicode(department), unicode(subunit), \
                 commit_every=1000)
    model.Session.commit()
    model.Session.remove()