# Module path: wdmmgext/wdmmgext/load/

# Script to import departmental spending data
import csv
from datetime import date
import glob
import os
import re
import sys
import xlrd
import datapkg
from django.template.defaultfilters import slugify
from pylons import config
import util
import wdmmg.model as model

# Identity of the target dataset (passed to util.get_or_create_dataset below).
dataset_name = u'departments'
dataset_long_name = u'UK central government department spending over 25,000' 
dataset_currency = u'gbp'
dataset_notes = u'UK central government department spending over 25,000'
# Root node of the loader's name hierarchy (see util.Node).
names_root = util.Node('')

def load_file(filepath, department, subunit, commit_every=None):
    Loads a file into a dataset with name 'departments'.
    - filepath - name of file to load  
    - department - which department's spending it is
    - subunit - sub-unit within that department
    - commit_every - if not None, call session.model.commit() at the 
        specified frequency.

    # Semaphore to prevent the data being loaded twice.
    # We check by filename, not dataset as in other loaders.
    filename = unicode(filepath.split("/")[-1])
    key_filename = util.get_or_create_key(name=u'filename', notes=u'''\
         Name of spending data file''')
    filename_exists = (model.Session.query(model.EnumerationValue)
            .filter_by(name=filename, key=key_filename)
    if filename_exists:
        print 'File %s already loaded' % filename
        return 1

    # Create the dataset or retrieve it, as appropriate.
    dataset_ = util.get_or_create_dataset(dataset_name, dataset_long_name,\
             dataset_currency, dataset_notes)
    # The keys used to classify departmental spending 
    key_from = util.get_or_create_key(name=u'from', notes=u'''\
The entity that the money was paid from.''')
    key_to = util.get_or_create_key(name=u'to', notes=u'''\
The entity that the money was paid to.''')
    key_time = util.get_or_create_key(name=u'time', notes=u'''\
The accounting period in which the spending happened.''')
    sub_unit = util.get_or_create_key(name=u'sub_unit', notes=u'''\
Department sub-unit''')
    department_family = util.get_or_create_key(name=u'department_family', notes=u'''\
Departmental family''')
    entity = util.get_or_create_key(name=u'entity', notes=u'''\
    expense_type = util.get_or_create_key(name=u'expense_type', notes=u'''\
Expense type''')
    expense_area = util.get_or_create_key(name=u'expense_area', notes=u'''\
Expense area''')
    transaction_number = util.get_or_create_key(name=u'transaction_number', notes=u'''\
Departmental transaction number''')

    # Add parent keys.
    key_parent = model.Session.query(model.Key).filter_by(name=u'parent').first()
    if not key_parent:
        key_parent = model.Key(name=u'parent', notes=u'Means "is part of".')
    expense_area.keyvalues[key_parent] =

    # Create key_from, which is the department.
    department = util.get_or_create_value(key_from, \
                  unicode(slugify(department)), department)
    department_code = department.code

    items = []
    is_csv = True
    # Some 'csv' files are actually Excel, deal with this.
    # Skip any blank rows at the beginning of fhe file.
        reader = csv.reader(open(filepath, "rU"))
        header =
        while not [word for word in header if 'amount' in word.lower()]:
            header =
    except csv.Error, e:
        print 'CSV error, not opening: %s' % e
        return 1
        # In case these spreadsheets don't get fixed, dummy loading code.
        is_csv = False
        book = xlrd.open_workbook(filename=filepath)
        sheet = book.sheet_by_index(0)
        header = [sheet.cell(0,col).value.strip() for col in range(0,sheet.ncols)]
        header_row = 0
        while not [word for word in header if 'amount' in word.lower()]:
            header_row += 1
            header = [sheet.cell(header_row,col).value.strip() for col in range(0,sheet.ncols)]
    # Tidy up and print the header.
    header = [h.lower().strip() for h in header]
    print header
    for row_index, row in enumerate(reader):
    #for row_index in range(header_row+1,sheet.nrows):
         if commit_every and row_index%commit_every == 0:
             print "Committing before processing row %d" % row_index
         #row = [sheet.cell(row_index,col).value for col in range(0,sheet.ncols)]
         row = [unicode(r.decode("mac_roman").strip()) for r in row]
         # Some spreadsheets (Welsh Office) have incomplete rows. Ignore these.
         if not row or len(row) < 8: 
             print 'NOT COMMITTING: TOO SHORT: %s' % row
         # Don't assume that ordering or wording is standard. 
         # Report any files that are missing columns.
         header = [h.replace("_", " ") for h in header]
             dept_family_index = [ i for i, word in enumerate(header) \
                            if 'family' in word][0]
             entity_index = [ i for i, word in enumerate(header) \
                            if 'entity' in word][0]
             date_index = [ i for i, word in enumerate(header) \
                            if 'date' in word][0]
             expense_type_index = [ i for i, word in enumerate(header) \
                            if 'expense type' in word][0]
             expense_area_index = [ i for i, word in enumerate(header) \
                            if 'expense area' in word][0]
             supplier_index = [ i for i, word in enumerate(header) \
                            if 'supplier' in word][0]
             transaction_index = [ i for i, word in enumerate(header) \
                            if 'transaction' or 'transation' in word][0]
             amount_index = [ i for i, word in enumerate(header) \
                            if 'amount' in word][0]
         except ValueError as e:
             print e, 'Standard header not found in %s' % filename
         department_family_value = row[dept_family_index]
         entity_value = row[entity_index]
         date = row[date_index]
         expense_type_value = row[expense_type_index]
         expense_area_value = row[expense_area_index]
         supplier_value = row[supplier_index]
         # Make EnumValue for supplier.
         supplier = util.get_or_create_value(
            key=key_to, code=unicode(slugify(supplier_value)),
         transaction_number_value = row[transaction_index]
         amount = util.to_float(row[amount_index].replace(u'\u00A3',''))
         # if len(row)>8 and 'DECC' not in filename:
         #     assert not row[8], 'File %s has more than 8 columns \
         #          - investigate!' % filename
         # Make the Entry and its ClassificationItems.
         txn = model.Entry(dataset_=dataset_, amount=amount)
         items = {
             key_from: department.code,
             key_to: supplier.code,
             key_time: date, 
         #print 'making transaction'
         for key, code in items.items():
                 value=util.get_or_create_value(key, code)
         items = {
             department_family: department_family_value,
             expense_type: expense_type_value,
             transaction_number: transaction_number_value,
             entity: entity_value,
             expense_area: expense_area_value,
             key_filename: filename, 
             sub_unit: subunit          
         for key, name in items.items():
                 value=util.get_or_create_value(key, unicode(slugify(name)), name)
    if commit_every:

# Map from the department code embedded in a data file's name to the
# department's full name. Sub-unit codes (e.g. 'UKTI-A') live here too,
# because get_department() looks sub-units up in the same table.
# FIX: the closing brace was missing in the source, leaving the literal
# unterminated.
dept_dictionary = { 
'A': 'Administration',
'AGO': 'Attorney General\'s Office',
'BIS': 'Department for Business, Innovation and Science',
'CLG': 'Department for Communities and Local Government',
'CO': 'Cabinet Office',
'COI': 'Central Office of Information',
'DCMS': 'Department for Culture, Media and Sport',
'DECC': 'Department for Energy & Climate Change',
'DEFRA': 'Department of the Environment, Food & Rural Affairs',
'DFE': 'Department for Education',
'DFT': 'Department for Transport',
'DH': 'Department of Health',
'DWP': 'Department for Work and Pensions',
'DfID': 'Department for International Development',
'GEO': 'Government Equalities Office',
'HMRC': 'HM Revenue & Customs',
'HMT': 'HM Treasury',
'HO': 'Home Office',
'HSE': 'Health & Safety Executive',
'MOD': 'Ministry of Defence',
'MoJ': 'Ministry of Justice',
'NIO': 'Northern Ireland Office',
'NSG': 'National School of Government',
'OAG': 'Office of the Advocate-General',
'UKTI-A': 'UK Trade & Industry - Administration',
'UKTI-P': 'UK Trade & Industry - Programme',
'Probation': 'Probation Trusts',
'SO': 'Scotland Office',
'TSol': 'Treasury Solicitor\'s Department',
'WO': 'Wales Office',
}

def get_department(filename):
    '''Work out (department, subunit) from a spending data file's name.

    Filenames look like 'Spend-Transactions-<DEPT>-<SUBUNIT>-...'; both
    codes are resolved through dept_dictionary. A purely numeric sub-code
    means the file is for the central department itself.

    Raises RuntimeError if the codes are not in dept_dictionary.
    '''
    parts = filename.split("-")
    parts = parts[2:] # Strip prefix.
    # NOTE(review): the try:/else: scaffold was lost in the source and is
    # reconstructed here from the surviving indentation and dangling raise.
    try:
        department = dept_dictionary[parts[0]]
        dept_subcode = parts[1]
        if dept_subcode.isdigit():
            subunit = "Central department"
        else:
            # Handle UKTI's various subunits as a special case.
            if dept_subcode=="UKTI":
                dept_subcode = parts[1] + "-" + parts[2]
            subunit = dept_dictionary[dept_subcode]
    except (KeyError, IndexError):
        raise RuntimeError('Filename %s not mapped in departmental names' % filename)
    return department, subunit   

def load():
    Loads each file in the departmental data in turn. 
    Access via the Paste Script comment "paster load departments" 
    path = os.path.join(config['getdata_cache'], 'departments')
    for spending_file in glob.glob(os.path.join(path, '*.csv') ):
        print "Looking at file %s" % spending_file
        #spending_file = 'Spend-Transactions-SO-OAG-06-Jun-2010.csv'
        department, subunit = get_department(spending_file)
        filepath = os.path.join(path, spending_file)
        #if not ('BIS-UKTI-A-' in spending_file): # Weird formatting.
             #if not ('SO-OAG-' in spending_file): # 'NULL byte' errors - broken file?
        load_file(unicode(filepath), unicode(department), unicode(subunit), \