Source

wdmmg-ext / wdmmgext / load / uganda.py

# Script to import Uganda data

from datetime import datetime
try:
    import json
except ImportError:
    import simplejson as json

from pylons import config
from datautil.tabular import gdocs
import xlrd

from wdmmg.lib import loader, munge
from wdmmg.lib.times import timespan
from wdmmgext.load import util

years = [u'2003-2004', u'2004-2005', u'2005-2006', u'2006-2007']
DATASET_NAME = u'uganda'


def load(is_test=False, *args):
    '''
    Loads the Uganda GDocs files into a dataset.
    '''
    # Make a suitably configured Loader (this also creates the Dataset).
    uganda_loader = loader.Loader(
        DATASET_NAME, ['name'], "Ugandan government spending",
        description=(
            u'Ugandan government and donor spending from 2003-2007, '
            u'supplied by <a href="http://www.publishwhatyoufund.org/">'
            u'PublishWhatYouFund</a>.\n\n'
            u'Source data can be found in the [CKAN data package]'
            u'(http://www.ckan.net/package/ugandabudget)'),
        currency=u'ugx')

    # Legislative oversight and budgeting: a world perspective /
    # Rick Stapenhurst:
    # ... Uganda budget cycle... The fiscal year commences July 1.
    times = []
    for year in years:
        y = int(year[:4])
        times.append(timespan(year, datetime(y, 7, 1), datetime(y + 1, 6, 30)))

    # Retrieve or create the keys used to classify Ugandan spending.
    uganda_loader.create_dimension(
        u'from', 'Paid by',
        u'The entity that the money was paid from.')
    uganda_loader.create_dimension(
        u'to', "Paid to",
        u'The entity that the money was paid to.')
    uganda_loader.create_dimension(
        u'time', "Time",
        u'The accounting period in which the spending happened.')
    uganda_loader.create_dimension(
        u'name', "Name",
        u'Internal dataset name')
    uganda_loader.create_dimension(
        u'cap_or_cur', "Capital or current",
        u'Capital (one-off investment) or Current (on-going running costs)')
    uganda_loader.create_dimension(
        'wage_or_nonwage', "Wage or non-wage",
        u'Wage-related current costs, or non-wage-related current costs.')
    uganda_loader.create_dimension(
        u'uganda_id', "Internal ID", u'Internal spending item ID.')
    uganda_loader.create_dimension(
        u'gou_vote', "GoU vote",
        u'Government of Uganda vote.')
    uganda_loader.create_dimension(u'project', "Project", u'')
    uganda_loader.create_dimension(
        u'mtef_sector', "MTEF Sector",
        u'Medium Term Expenditure Framework Sector.')
    uganda_loader.create_dimension(
        u'mtef_reference', "MTEF Reference",
        u'Medium Term Expenditure Framework Reference.')
    uganda_loader.create_dimension(u'paf', "Poverty Action Fund.", u'')
    uganda_loader.create_dimension(u'swg', "Sector Working Group", u'')
    uganda_loader.create_dimension(
        u'sector_objective', "Sector objective",
        u'Sector objective (more info TBA).')
    uganda_loader.create_dimension(
        u'peap1', 'PEAP Pillar',
        u'Poverty Eradication Action Plan classification level 1')
    uganda_loader.create_dimension(
        u'peap2', 'PEAP Objective',
        u'Poverty Eradication Action Plan classification level 2')
    uganda_loader.create_dimension(
        u'peap3', 'PEAP Area',
        u'Poverty Eradication Action Plan classification level 3')

    # Get header indices for each of the columns we care about.
    def get_header_indices(header):
        try:
            id_idx = util.find_idx(header, 'id', required=False)
            vote_idx = util.find_idx(header, 'gouvote', required=False)
            vote_name_idx = util.find_idx(header, 'votename', required=False)
            proj_code_idx = util.find_idx(header, 'projectcode',
                                          required=False)
            proj_name_idx = util.find_idx(header, 'projectname',
                                          required=False)
            dp_idx = util.find_idx(header, 'dp', required=False)
            gou_idx = util.find_idx(header, 'gou', required=False)
            mtef_sector_idx = util.find_idx(header, 'mtefsector',
                                            required=False)
            mtef_ref_idx = util.find_idx(header, 'mtefreference',
                                         required=False)
            paf_idx = util.find_idx(header, 'paf', required=False)
            swg_idx = util.find_idx(header, 'swg', required=False)
            so_idx = util.find_idx(header, 'sectorobjective', required=False)
            peap1_idx = util.find_idx(header, 'peappillar', required=False)
            peap2_idx = util.find_idx(header, 'peapobjective', required=False)
            peap3_idx = util.find_idx(header, 'peaparea', required=False)
            amount_03_index = util.find_idx(header, 'outturn2003-2004ugxbn',
                                            required=False)
            amount_04_index = util.find_idx(header, 'outturn2004-2005ugxbn',
                                            required=False)
            amount_05_index = util.find_idx(header, 'outturn2005-2006ugxbn',
                                            required=False)
            amount_06_index = util.find_idx(header, 'budget2006-2007ugxbn',
                                            required=False)
        except Exception, e:
            print 'ERROR: ', e
            return 0

        header_indices = {'id_idx': id_idx,
                          'vote_idx': vote_idx,
                          'vote_name_idx': vote_name_idx,
                          'proj_code_idx': proj_code_idx,
                          'proj_name_idx': proj_name_idx,
                          'dp_idx': dp_idx,
                          'gou_idx': gou_idx,
                          'mtef_sector_idx': mtef_sector_idx,
                          'mtef_ref_idx': mtef_ref_idx,
                          'paf_idx': paf_idx,
                          'swg_idx': swg_idx,
                          'so_idx': so_idx,
                          'peap1_idx': peap1_idx,
                          'peap2_idx': peap2_idx,
                          'peap3_idx': peap3_idx,
                          'amount_03_index': amount_03_index,
                          'amount_04_index': amount_04_index,
                          'amount_05_index': amount_05_index,
                          'amount_06_index': amount_06_index}
        return header_indices

    # Make entries for each row.
    def process_row(row, header_indices, who_from, cap_or_cur,
                    wage_or_nonwage, entry_id):
        #print row
        id_value = row[header_indices['id_idx']]
        funders = {"Donors": "01",
                   "Government": "02",
                   "Government (including Budget Support)": "03"}
        funder = uganda_loader.create_entity(name=funders[who_from],
                                             label=who_from)
        # GoU vote. Create this as an Entity. (Also used as value for 'to'.)
        gou_vote_value = row[header_indices['vote_idx']]
        gou_vote_name_value = row[header_indices['vote_name_idx']]
        gou_vote = uganda_loader.create_entity(name=gou_vote_value,
                                               label=gou_vote_name_value)
        # Project details. Create this as an Entity.
        project_code_value = row[header_indices['proj_code_idx']]
        project_name_value = row[header_indices['proj_name_idx']]
        project = uganda_loader.create_classifier(name=project_code_value,
                                                  taxonomy='ugproj',
                                                  label=project_name_value)
        # MTEF/SWF/Sector. Create as Entities.
        mtef_sector_value = row[header_indices['mtef_sector_idx']]
        mtef_sector = uganda_loader.create_classifier(
            name=munge.to_name(mtef_sector_value),
            taxonomy='mtef', label=mtef_sector_value)
        mtef_reference_value = row[header_indices['mtef_ref_idx']]
        mtef_reference = uganda_loader.create_classifier(
            name=munge.to_name(mtef_reference_value),
            taxonomy='mtef', label=mtef_reference_value)
        swg_value = row[header_indices['swg_idx']]
        swg = uganda_loader.create_classifier(name=munge.to_name(swg_value),
            taxonomy='swg', label=swg_value)
        sector_objective_value = row[header_indices['so_idx']]
        sector_objective = uganda_loader.create_classifier(
            name=munge.to_name(sector_objective_value),
            taxonomy='ugsecobj', label=sector_objective_value)

        # Clean up PEAP codes on the fly
        def peap(row_value, parent=None, level=1):
            row_values = row_value.split(" ")
            row_code = row_values[0]
            # Deal with non-numeric values - assign codes of 0.
            if not row_code[0].isdigit():
                if parent is not None:
                    row_code = parent.name + ".0"
                else:
                    row_code = "0"
            else:
                if row_code[-1] == ".":
                    row_code = row_code[:-1]
                row_value = " ".join(row_values[1:])
            return uganda_loader.create_classifier(
                unicode(row_code), 'peap', level=level,
                label=unicode(row_value),
                parent=parent.to_ref() if parent else None)
        peap1 = peap(row[header_indices['peap1_idx']], level=1)
        peap2 = peap(row[header_indices['peap2_idx']], peap1, level=2)
        peap3 = peap(row[header_indices['peap3_idx']], peap2, level=3)
        amounts = [row[header_indices['amount_03_index']], \
                   row[header_indices['amount_04_index']], \
                   row[header_indices['amount_05_index']], \
                   row[header_indices['amount_06_index']]]
        expenditures = [round(1e9 * util.to_float(x)) for x in amounts]

        # Skip row whose expenditures are all zero.
        if not [x for x in expenditures if x]:
            return 1
        for time, exp in zip(times, expenditures):
            if exp:
                entry_id += 1
                print ('Creating an entry with amount %s and ID %s'
                       % (exp, entry_id))
                entry = {
                    'amount': exp,
                    'name': DATASET_NAME + '-r' + str(entry_id),
                    'from': funder,
                    'to': gou_vote,
                    'time': time,
                    'cap_or_cur': cap_or_cur,
                    'wage_or_nonwage': wage_or_nonwage,
                    'uganda_id': id_value
                    }

                #uganda_loader.classify_entry(entry, gou_vote, 'gou_vote')
                uganda_loader.classify_entry(entry, project, 'project')
                uganda_loader.classify_entry(entry, mtef_sector, 'mtef_sector')
                uganda_loader.classify_entry(entry, mtef_reference,
                    'mtef_reference')
                uganda_loader.classify_entry(entry, swg, 'swg')
                uganda_loader.classify_entry(entry, sector_objective,
                    'sector_objective')
                uganda_loader.classify_entry(entry, peap1, 'peap1')
                uganda_loader.classify_entry(entry, peap2, 'peap2')
                uganda_loader.classify_entry(entry, peap3, 'peap3')

                uganda_loader.create_entry(**entry)

    def load_file(who_from, cap_or_cur, wage_or_nonwage, entry_id,
                  sheet_name=None, filename=None):
        # Open the file: first get header indices, then process each row.
        if filename:
            book = xlrd.open_workbook(filename=filename)
            sheet = book.sheet_by_index(0)
            # Get header and indices.
            header = [str(sheet.cell(0, col).value) for
                      col in range(0, sheet.ncols)]
            print header
            header_indices = get_header_indices(header)
            # Process each row.
            for row_num in range(2, sheet.nrows - 1):
                entry_id += 1
                row = [str(sheet.cell(row_num, col).value)
                       for col in range(0, sheet.ncols)]
                row = [unicode(x.strip()) if x is not None else u'' for
                       x in row]
                process_row(row, header_indices, who_from, cap_or_cur,
                            wage_or_nonwage, entry_id)
        else:
            # Get header and indices.
            all_rows = gdocs.GDocsReaderTextDb('t5yiq3m1DbXC3pvB29GCwfg',
                config['gdocs_username'], config['gdocs_password']
            ).read(sheet_name=sheet_name).to_list()
            header = all_rows[0]
            print header
            header_indices = get_header_indices(header)
            # Process each row.
            for row in all_rows[2:]:
                entry_id += 1
                row = [unicode(x.strip()) if x is not None else u'' for
                       x in row]
                process_row(row, header_indices, who_from, cap_or_cur,
                            wage_or_nonwage, entry_id)
        return entry_id

    load_file(who_from=u'Donors', cap_or_cur=u'Capital', entry_id=0,
              wage_or_nonwage=u'N/A (capital spending)',
              sheet_name='Donor Development')
    if not is_test:
        n = load_file(who_from=u'Government',
                      cap_or_cur=u'Capital', entry_id=0,
                      wage_or_nonwage=u'N/A (capital spending)',
                      sheet_name='GoU Development')
        n = load_file(who_from=u'Government (including Budget Support)',
                      entry_id=n, cap_or_cur=u'Current',
                      wage_or_nonwage=u'Wage',
                      sheet_name='GoU Development')
        n = load_file(who_from=u'Government (including Budget Support)',
                      entry_id=n, cap_or_cur=u'Current',
                      wage_or_nonwage=u'Non-wage',
                      sheet_name='GoU Development')

    uganda_loader.compute_aggregates()