wdmmg-ext / wdmmgext / load /

import csv
import os

import datapkg
from datautil.tabular import gdocs
from pylons import config

from wdmmg import model
from wdmmg.getdata import util

def drop():
    Drops Keys 'cofog1', 'cofog2' and 'cofog3' and all associated
    EnumerationValues and KeyValues.
    Does not drop Key 'parent'.
    Aborts if Keys are in use (as predicates).
    # Semaphore to avoid corrupting other data sets: check that Keys are not in use.
    for name in [u'cofog3', u'cofog2', u'cofog1']:
        assert (model.Session.query(model.KeyValue)
            .filter( == name)
            ).count() == 0, name
    # Delete Keys.
    for name in [u'cofog3', u'cofog2', u'cofog1']:
        print 'Deleting key', name
        key = model.Session.query(model.Key).filter_by(name=name).first()
        if key:
            key.keyvalues.clear() # TODO: Work out how to make this automatic.

def load():
    Downloads the COFOG list, and loads it into the database with key names
    'cofog1', 'cofog2' and 'cofog3'.

    Typically this is invoked from a paster shell:
      $ paster shell
      >>> from wdmmg.getdata import cofog
      >>> cofog.load()

    The cofog datapkg must be in place.
    # Get the COFOG data package.
    pkgspec = 'file://%s' % os.path.join(config['getdata_cache'], 'cofog')
    pkg = datapkg.load_package(pkgspec)
    fileobj ='cofog.csv')

def dejargonise():
    Downloads alternative names for the COFOG codes, chosen specially for WDMMG,
    and replaces the names in the database. The original names are backed up.
    keys = [model.Session.query(model.Key).filter_by(name=name).one()
        for name in u'cofog1', u'cofog2', u'cofog3']
    # Create the 'official_name' Key if necessary.
    key_official_name = util.get_or_create_key(name=u'official_name', notes=u'''\
The official names of things, as defined by the relevant standards body. We \
do not always use the official names for things, because they are sometimes \
difficult to understand. When we substitute an alternative name, we record \
the official name using this key.''')
    # Loop through the rows of the Google spreadsheet where the data is maintained.
    for row in gdocs.GDocsReaderTextDb(
        'tQSJ9dxTh8AKl-ON4Qqja8Q', # Google spreadsheet key.
#        print row
        code, official_name, alternative_name, notes = row
        ev = (model.Session.query(model.EnumerationValue)
            .filter(model.EnumerationValue.key_id.in_([ for k in keys]))
        assert ev, 'Spreadsheet contains an unknown COFOG code: %r' % row
        if alternative_name:
            print "Setting name of %r to %r" % (code, alternative_name)
            if key_official_name not in ev.keyvalues:
                # Make a backup of the official name.
                # assert == official_name
                ev.keyvalues[key_official_name] =
   = unicode(alternative_name)

def promote_notes():
    Where a level 2 COFOG code has exactly one level 3 sub-code, there is
    often no detailed description for the level 2 code. This method will
    supply the missing description by copying it from the level 3 sub-code.
    key_cofog2 = model.Session.query(model.Key).filter_by(name=u'cofog2').one()
    key_cofog3 = model.Session.query(model.Key).filter_by(name=u'cofog3').one()
    key_parent = model.Session.query(model.Key).filter_by(name=u'parent').one()
    all_cofog2s = (model.Session.query(model.EnumerationValue)
    for ev in all_cofog2s:
        if ev.notes:
        children = (model.Session.query(model.EnumerationValue)
            .filter(model.KeyValue.key == key_parent)
            .filter(model.KeyValue.value == ev.code)
        if len(children)==1 and children[0].notes:
            # Copy `notes` from child to parent.
            print "Copying notes from %s to %s" % (children[0].code, ev.code)
            print "Old notes = ", ev.notes
            ev.notes = children[0].notes
            print "New notes = ", ev.notes

def load_file(fileobj):
    Loads the specified COFOG-like file into the database with key names
    'cofog1', 'cofog2' and 'cofog3'.
    # Semaphore to avoid creating multiple copies.
    assert not model.Session.query(model.Key).filter_by(name=u'cofog1').first(), "COFOG already loaded"
    # Create the 'parent' Key if necessary.
    key_parent = model.Session.query(model.Key).filter_by(name=u'parent').first()
    if not key_parent:
        key_parent = model.Key(name=u'parent', notes=u'Means "is part of".')
    # Create the COFOG Keys.
    key_cofog1 = model.Key(name=u'cofog1', notes=u'Classification Of Function Of Government, level 1')
    key_cofog2 = model.Key(name=u'cofog2', notes=u'Classification Of Function Of Government, level 2')
    key_cofog3 = model.Key(name=u'cofog3', notes=u'Classification Of Function Of Government, level 3')
    model.Session.add_all([key_cofog1, key_cofog2, key_cofog3])
    key_cofog2.keyvalues[key_parent] =
    key_cofog3.keyvalues[key_parent] =
    # For each row in the file...
    reader = csv.reader(fileobj)
    header = # Code, Title, Details, Change date.
    for row_index, cells in enumerate(reader):
        code, title, details, change_date = [unicode(x, 'UTF-8') for x in cells]
        # Create the enumeration values.
        print '%r, %r' % (code, title)
        parts = code.split('.')
        parents = [u'.'.join(parts[:i+1]) for i, _ in enumerate(parts)]
        print parents
        if len(parents)==1:
            print 'Creating level 1 code', parents[0]
            ev = model.EnumerationValue(key=key_cofog1, code=parents[0], name=title, notes=details)
        elif len(parents)==2:
            print 'Creating level 2 code', parents[1]
            ev = model.EnumerationValue(key=key_cofog2, code=parents[1], name=title, notes=details)
            ev.keyvalues[key_parent] = parents[0]
        elif len(parents)==3:
            print 'Creating level 3 code', parents[2]
            ev = model.EnumerationValue(key=key_cofog3, code=parents[2], name=title, notes=details)
            ev.keyvalues[key_parent] = parents[1]
            assert False, code