Commits

pudo committed 97cab4b

[load] remove loaders made redundant by csv loader


Files changed (7)

wdmmgext/extract/de_bund.py

-#coding: utf-8
-import codecs
-import re
-import json
-import logging
-import os
-from csv import DictWriter
-from os.path import join
-from pprint import pprint
-from lxml import html
-from urlparse import urljoin
-
-BASE_URL = "http://www.bundesfinanzministerium.de/bundeshaushalt%s/html/ep00.html"
-
-log = logging.getLogger(__name__)
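-
-# The scraper drills down through the site hierarchy: Einzelplan
-# (department) -> Kapitel (chapter) -> Titelgruppe (title group) ->
-# individual Titel rows; load_budget() writes them out as CSV and JSON.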
-
-def clean(elem):
-    return elem.xpath("string()").strip()
-
-def anchors(doc, rfilter):
-    f = re.compile(rfilter)
-    for a in doc.findall('.//a'):
-        href = a.get('href')
-        if href is None:
-            continue
-        match = f.match(href)
-        if match:
-            yield (href, clean(a))
-
-def load_budget(base_url, year):
-    url = base_url % year
-    context = {'data_year': year}
-    rows = []
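-    # Emit both CSV (flat fields only) and JSON (which also keeps the
-    # nested commitment_appropriations, remarks and description fields).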
-    csv_fh = open('bund_%s.csv' % year, 'wb')
-    keys = None
-    csv = None
-    for row in load_einzelplaene(url, context):
-        if keys is None:
-            keys = sorted(row.keys())
-            csv = DictWriter(csv_fh, keys)
-        else:
-            ckeys = sorted(row.keys())
-            assert ckeys == keys, (row.get('url'), keys, ckeys)
-        for k, v in row.items(): 
-            if isinstance(v, unicode):
-                row[k] = v.encode('utf-8')
-        rows.append(row)
-        csv_row = row.copy()
-        del csv_row['commitment_appropriations']
-        del csv_row['remarks']
-        del csv_row['description']
-        csv.writerow(csv_row)
-    csv_fh.close()
-    fh = open('bund_%s.json' % year, 'w')
-    json.dump(rows, fh, encoding='utf-8', indent=2)
-    fh.close()
-
-def load_einzelplaene(url, context):
-    doc = html.parse(url)
-    for (href, label) in anchors(doc, "ep\d{2,}/ep\d{2,}.html"):
-        ep_url = urljoin(url, href)
-        ep_context = context.copy()
-        name = ep_context['ep_id'] = re.match('.*ep(\d*).html', ep_url).group(1)
-        ep_context['ep_url'] = urljoin(url, "../html/ep" + name + "/ep" + name + ".html")
-        ep_context['ep_pdf'] = urljoin(url, "../pdf/epl" + name + ".pdf")
-        ep_context['ep_label'] = label
-        for r in load_kapitel(ep_url, ep_context):
-            yield r
-
-def load_kapitel(url, context):
-    doc = html.parse(url)
-    exp = "ep" + context.get('ep_id') + "kp\d{2,}.html"
-    for (href, label) in anchors(doc, exp):
-        kp_url = urljoin(url, href)
-        name_part = re.match('.*kp(\d*).html', kp_url).group(1)
-        ep_id = context.get('ep_id')
-        kp_context = context.copy()
-        kp_context['kp_label'] = label
-        kp_context['kp_url'] = kp_url
-        kp_context['kp_pdf'] = urljoin(url, "../../pdf/epl" + ep_id + "/s" + \
-            ep_id + name_part + ".pdf")
-        kp_context['kp_id'] = ep_id + name_part
-        doc = html.parse(kp_url)
-        exp = ".*kp" + name_part + "nr[ae].*.html"
-        for (href, label) in anchors(doc, exp):
-            group_file = urljoin(kp_url, href)
-            for r in load_titelgruppe(group_file, label, kp_context):
-                yield r
-
-
-def load_titelgruppe(url, label, context):
-    grp_context = context.copy()
-    grp_context['group_label'] = label
-    grp_context['url'] = url
-    doc = html.parse(url)
-    match = re.match('.*nr([ae])(\d*).html', url)
-    grp_context['group_id'] = context.get('kp_id') + "-" + match.group(2)
-    grp_context['flow'] = 'revenue' if match.group(1) == 'e' else 'spending'
-    
-    for row in doc.findall('.//tr'):
-        for r in load_posten_row(row, grp_context):
-            yield r
-
-
-def load_posten_row(row, context):
-    pcontext = context.copy()
-    pcontext['commitment_appropriations'] = {}
-    pcontext['description'] = ''
-    pcontext['remarks'] = []
-    pcontext['pdf'] = ''
-    year = int(context.get('data_year'))
-    entries = []
-    for i, column in enumerate(row.findall('./td')):
-        if i == 0:
-            name = column.xpath("string()")
-            if not len(name):
-                break
-            if 'Tgr' in name:
-                break
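-            # An 'F ' prefix marks a flexibilisierter Titel (flexible
-            # budget management).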
-            if name.startswith('F '):
-                pcontext['flexible'] = True
-                name = name[1:]
-            else: 
-                pcontext['flexible'] = False
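-            # A valid Titel number has exactly nine digit/hyphen
-            # characters once all other formatting is stripped.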
-            name = [c for c in name if c in '-0123456789']
-            if not len(name) == 9:
-                break
-            name = pcontext.get('kp_id') + "".join(name)
-            pcontext['id'] = name.strip()
-        if i == 1:
-            pcontext['label'] = column.text.strip() if column.text else None
-            section = ""
-            for elem in column:
-                if elem.tag == 'hr': 
-                    parse_section(section, pcontext)
-                    section = ""
-                elif 'title' in elem.keys() and \
-                    elem.get('title').startswith('PDF Dokument'):
-                    pcontext['pdf'] = urljoin(pcontext.get('url'), elem.get('href'))
-                else:
-                    section += html.tostring(elem).strip()
-                    if elem.tail:
-                        section += elem.tail
-            if len(section):
-                parse_section(section, pcontext)
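-        # Columns 2-4 hold projected figures for the budget year and the
-        # year before, then actual figures from two years back.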
-        if i == 2:
-            entries.append(parse_posten(column, True, year, context))
-        if i == 3:
-            entries.append(parse_posten(column, True, year - 1, context))
-        if i == 4:
-            entries.append(parse_posten(column, False, year - 2, context))
-    if not 'id' in pcontext: 
-        return 
-
-    for entry in entries:
-        e = pcontext.copy()
-        e.update(entry)
-        yield e  
-
-def handle_number(num):
-    try:
-        # Figures are quoted in thousands; strip all formatting first.
-        val = "".join([c for c in num if c in '-0123456789'])
-        val = int(val) * 1000
-        return float(val)
-    except (TypeError, ValueError):
-
-
-def parse_posten(column, is_projection, year, context):
-    p = {'projection': is_projection, 'year': year}
-    p['amount'] = handle_number(column.text)
-    return p
-        
-
-re_YEAR = re.compile(".*(2\d{3}).*")
-def parse_section(section, context): 
-    if not len(section.strip()): 
-        return
-    doc = html.fragment_fromstring("<span>" + section + "</span>")
-    h4 = doc.findtext('.//h4')
-    assert h4 is not None, section.encode('utf-8')
-    if u'Verpflichtungser' in h4:
-        capps = {}
-        for p in doc.findall('.//p'):
-            text = p.text or ''
-            if u'davon fällig' in text:
-                continue
-            year = re_YEAR.match(text)
-            if year is not None:
-                span = p.find('./span')
-                capps[int(year.group(1))] = handle_number(span.text)
-        context['commitment_appropriations'] = capps
-    elif u'Erläuterungen' in h4:
-        context['description'] = section
-    elif u'Haushaltsvermerk' in h4:
-        for tr in doc.findall('.//tr'):
-            tds = tr.findall('./td')
-            assert len(tds)==2
-            context['remarks'].append(tds[1].xpath("string()").strip())
-
-
-if __name__ == '__main__': 
-    for year in [2005, 2006, 2007, 2008, 2009, 2010, 2011]:
-        load_budget(BASE_URL, year)

wdmmgext/load/de/__init__.py

Empty file removed.

wdmmgext/load/de/bund_budget.py

-# coding: utf-8
-
-import os
-import json
-
-import datapkg
-from pylons import config
-
-from wdmmg.model import Classifier, Entry, Dataset
-from wdmmg.lib import times, munge
-from wdmmg.lib.loader import Loader
-
-from core_taxonomies import FPL_TAXONOMY, GPL_TAXONOMY
-
-DATASET_NAME = u'bund'
-BUND_TAXONOMY = u'bund'
-
-EP_COLORS = {
-    "01": "#CA221D",
-    "02": "#CA221D",
-    "03": "#CA221D",
-    "04": "#CA221D",
-    "05": "#C22769",
-    "06": "#3F93E1",
-    "07": "#481B79",
-    "08": "#6AAC32",
-    "09": "#42928F",
-    "10": "#D32645",
-    "11": "#CD531C",
-    "12": "#EDC92D",
-    "14": "#A5B425",
-    "15": "#211D79",
-    "16": "#449256",
-    "17": "#7A2077",
-    "19": "#CA221D",
-    "20": "#CA221D",
-    "23": "#E29826",
-    "30": "#44913D",
-    "32": "#2458A3",
-    "33": "#2458A3",
-    "60": "#14388C"
-}
-
-
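-# Convert the scraped JSON rows into classified entries: each row is
-# attached to the 'bund' taxonomy (Einzelplan -> Kapitel -> Titelgruppe
-# -> Titel) as well as the Funktionenplan and Gruppierungsplan.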
-def load_entries(loader, filename):
-    fh = open(filename, 'rb')
-    data = json.load(fh)
-    fh.close()
-    
-    for entry in data: 
-        p = 'p' if entry.get('projection') else ''
-        entry['name'] = munge.to_name("bund-%s-%s%s" % (
-            entry.get('id'), entry.get('year'), p))
-        entry['to'] = loader.society
-        einzelplan = loader.create_classifier(name=entry.get('ep_id'), 
-                                         taxonomy=BUND_TAXONOMY,
-                                         label=entry.get('ep_label'),
-                                         url=entry.get('ep_url'),
-                                         level=1,
-                                         color=EP_COLORS.get(entry.get('ep_id')),
-                                         pdf=entry.get('ep_pdf'))
-
-        from_ = loader.create_entity(name=munge.to_name("DE-" + entry.get('ep_label')), 
-                                     label=entry.get('ep_label'),
-                                     url=entry.get('ep_url'),
-                                     ep_id=entry.get('ep_id'),
-                                     color=EP_COLORS.get(entry.get('ep_id')),
-                                     pdf=entry.get('ep_pdf'))
-        entry['from'] = from_
-        del entry['ep_id']
-        del entry['ep_label']
-        del entry['ep_url']
-        del entry['ep_pdf']
-        loader.classify_entry(entry, einzelplan, 'einzelplan')
-        
-        kapitel = loader.create_classifier(name=entry.get('kp_id'), 
-                                           taxonomy=BUND_TAXONOMY,
-                                           parent=einzelplan.to_ref(),
-                                           level=2,
-                                           label=entry.get('kp_label'),
-                                           url=entry.get('kp_url'),
-                                           pdf=entry.get('kp_pdf'))
-        del entry['kp_id']
-        del entry['kp_label']
-        del entry['kp_url']
-        del entry['kp_pdf']
-        loader.classify_entry(entry, kapitel, 'kapitel')
-        
-        group = loader.create_classifier(name=entry.get('group_id'), 
-                                         taxonomy=BUND_TAXONOMY,
-                                         parent=kapitel.to_ref(),
-                                         level=3,
-                                         label=entry.get('group_label'))
-        del entry['group_id']
-        del entry['group_label']
-        loader.classify_entry(entry, group, 'titelgruppe')
-
-        titel = loader.create_classifier(name=entry.get('id'), 
-                                         taxonomy=BUND_TAXONOMY,
-                                         parent=kapitel.to_ref(),
-                                         label=entry.get('label'),
-                                         description=entry.get('description'),
-                                         level=4,
-                                         url=entry.get('url'),
-                                         pdf=entry.get('pdf'),
-                                         flow=entry.get('flow'),
-                                         flexible=entry.get('flexible'),
-                                         remarks=entry.get('remarks'),
-                                         commitment_appropriations=entry.get('commitment_appropriations'))
-        
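-        # The Titel id embeds classification digits: positions 10-13
-        # map into the Funktionenplan, positions 4-7 into the
-        # Gruppierungsplan.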
-        id = entry['id']
-        loader.classify_entry(entry, loader.get_classifier(id[10:11] + '00', FPL_TAXONOMY), 
-                'hauptfunktion')
-        loader.classify_entry(entry, loader.get_classifier(id[10:12] + '0', FPL_TAXONOMY), 
-                'oberfunktion')
-        loader.classify_entry(entry, loader.get_classifier(id[10:13], FPL_TAXONOMY), 
-                'funktion')
-        
-        loader.classify_entry(entry, loader.get_classifier(id[4:5] + '00', GPL_TAXONOMY), 
-                'hauptgruppe')
-        loader.classify_entry(entry, loader.get_classifier(id[4:6] + '0', GPL_TAXONOMY), 
-                'obergruppe')
-        loader.classify_entry(entry, loader.get_classifier(id[4:7], GPL_TAXONOMY), 
-                'gruppe')
-        
-        del entry['id']
-        del entry['label']
-        del entry['description']
-        del entry['url']
-        del entry['pdf']
-        del entry['remarks']
-        del entry['commitment_appropriations']
-        loader.classify_entry(entry, titel, 'titel')
-        
-        entry['time'] = times.for_year(entry.get('year'))
-        del entry['year']
-
-        if entry.get('flow') == 'revenue':
-            entry['from'], entry['to'] = entry['to'], entry['from']
-        
-        entry = dict([(str(k), v) for k, v in entry.items()])
-        q = {'projection': entry.get('projection'), 
-             'titel.name': entry.get('titel').get('name'), 
-             'time.unparsed': entry.get('time').get('unparsed')}
-        loader.create_entry(q, **entry)
-
-
-def make_loader():
-    loader = Loader(DATASET_NAME, "German Bundeshaushalt",
-        currency='EUR',
-        description=u'''The German federal budget, as scraped from the finance ministry's web site. 
-
-Source data is available as a [CKAN package](http://ckan.net/package/de-budget).''')
-    return loader
-
-
-def load():
-    '''
-    Usually accessed via the Paste Script command "paster load bund"; see
-    ../lib/cli.py .
-    '''
-    # Fetch the de-budget data package from the local cache.
-    pkgspec = 'file://%s' % os.path.join(config['getdata_cache'], 'de-budget')
-    datapkg.load_package(pkgspec)
-    
-    assert Classifier.find_one({'taxonomy': FPL_TAXONOMY}), 'Funktionenplan must be loaded first'
-    assert Classifier.find_one({'taxonomy': GPL_TAXONOMY}), 'Gruppierungsplan must be loaded first'
-    
-    # Make a suitably configured Loader (this also creates the Dataset).
-    loader = make_loader()
-
-    def describe_dimension(key, label, description=None):
-        loader.create_dimension(key, label, description=description)
-
-    # Retrieve or create the required Keys.
-    describe_dimension(u'flow', 'Flow of spending', description=u'''Whether this is income or spending.''')
-    describe_dimension(u'einzelplan', 'Einzelplan', description='')
-    describe_dimension(u'kapitel', 'Kapitel', description='')
-    describe_dimension(u'titelgruppe', 'Titelgruppe', description='')
-    describe_dimension(u'titel', 'Titel', description=u'''Unterste Ebene der
-Klassifikation''')
-    describe_dimension(u'obergruppe', 'Obergruppe', 
-            description=u'''Gruppierungsplan, Ebene 1''')
-    describe_dimension(u'hauptgruppe', 'Hauptgruppe',
-            description=u'''Gruppierungsplan, Ebene 2''')
-    describe_dimension(u'gruppe', 'Gruppe', 
-            description=u'''Gruppierungsplan, Ebene 3''')
-    describe_dimension(u'oberfunktion', 'Oberfunktion', 
-            description=u'''Funktionenplan, Ebene 1''')
-    describe_dimension(u'hauptfunktion', 'Hauptfunktion',
-            description=u'''Funktionenplan, Ebene 2''')
-    describe_dimension(u'funktion', 'Funktion', 
-            description=u'''Funktionenplan, Ebene 3''')
-
-    describe_dimension(u'projection', 'Projektion', description=u'''Projizierte oder
-tatsächliche Ausgaben/Einnahmen.''')
-
-    describe_dimension(u'data_year', 'Quelljahr der Angabe', description=u'''Aus
-welchem Haushaltsdokument stammt dieser Betrag.''')
-    describe_dimension(u'flexible', 'Flexibilisierter Titel', description=u'''
-Gegenstand der flexibilisierten Haushaltsführung''')
-
-    # Do we need big_society as well? 
-    loader.society = loader.create_entity(name=u'de-society', 
-        label=u'Die Öffentlichkeit',
-        description=u'''Der Empfänger nicht weiter aufgeschlüsselter Ausgaben des Staates.'''
-    )
-
-    for year in [2005, 2006, 2007, 2008, 2009, 2010, 2011]:
-        filename = os.path.join(config['getdata_cache'],
-                'de-budget', 'bund_%s.json' % year)
-        load_entries(loader, filename)
-    aggregate(loader)
-
-    # Finish off.
-def aggregate(loader=None):
-    if loader is None:
-        loader = make_loader()
-
-    loader.flush_aggregates()
-    loader.create_view(Dataset, {'name': DATASET_NAME}, name='default', 
-                       label="Ausgaben nach Einzelplan", 
-                       dimension="dataset", breakdown="einzelplan",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Dataset, {'name': DATASET_NAME}, name='funktion', 
-                       label="Ausgaben nach Funktion", 
-                       dimension="dataset", breakdown="hauptfunktion",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Dataset, {'name': DATASET_NAME}, name='gruppierung', 
-                       label="Ausgaben nach Gruppierung", 
-                       dimension="dataset", breakdown="hauptgruppe",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Dataset, {'name': DATASET_NAME}, name='einnahmen', 
-                       label="Einnahmen", 
-                       dimension="dataset", breakdown="titel",
-                       view_filters={'projection': True, 'flow': 'revenue'})
-    loader.create_view(Classifier, {'taxonomy': BUND_TAXONOMY, 'level': 1}, name='default', 
-                       label="Ausgaben nach Kapitel", 
-                       dimension="einzelplan", breakdown="kapitel",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': BUND_TAXONOMY, 'level': 2},
-                       name='titelgruppe', 
-                       label="Ausgaben pro Titelgruppe", 
-                       dimension="kapitel", breakdown="titelgruppe",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': BUND_TAXONOMY, 'level': 3}, name='default', 
-                       label="Ausgaben pro Titel", 
-                       dimension="titelgruppe", breakdown="titel",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': BUND_TAXONOMY, 'level': 2}, name='default', 
-                       label="Ausgaben pro Titel", 
-                       dimension="kapitel", breakdown="titel",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': FPL_TAXONOMY, 'level': 1}, name='default', 
-                       label="Ausgaben nach Oberfunktion", 
-                       dimension="hauptfunktion", breakdown="oberfunktion",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': FPL_TAXONOMY, 'level': 2}, name='default', 
-                       label="Ausgaben nach Funktion", 
-                       dimension="oberfunktion", breakdown="funktion",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': GPL_TAXONOMY, 'level': 1}, name='default',
-                       label="Ausgaben nach Obergruppe",
-                       dimension="hauptgruppe", breakdown="obergruppe",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.create_view(Classifier, {'taxonomy': GPL_TAXONOMY, 'level': 2}, name='default',
-                       label="Ausgaben nach Gruppe",
-                       dimension="obergruppe", breakdown="gruppe",
-                       view_filters={'projection': True, 'flow': 'spending'})
-    loader.compute_aggregates()
-
-
-
-
-
-
-
-
-

wdmmgext/load/de/core_taxonomies.py

-from pylons import config
-from swiss.tabular import gdocs
-
-from wdmmg.model import Classifier
-
-FPL_TAXONOMY = 'funktionenplan'
-GPL_TAXONOMY = 'gruppierungsplan'
-
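-# NB: the mutable default _cache is deliberate; it memoises classifiers
-# across calls so parents can be resolved by (taxonomy, name).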
-def make_classifier(name, taxonomy, label, parent, color, description, source, level, _cache={}, **kwargs):
-    kwargs['label'] = label
-    kwargs['level'] = level
-    if color is not None and len(color.strip()) > 3:
-        kwargs['color'] = color.strip()
-    if parent is not None:
-        kwargs['parent'] = _cache[(taxonomy, parent)].to_ref()
-    q = {'name': name, 'taxonomy': taxonomy}
-    Classifier.c.update(q, {"$set": kwargs}, upsert=True)
-    _cache[(taxonomy, name)] = Classifier.find_one(q)
-
-def load_gdocs_plan(taxonomy_name, gdocs_id):
-    Classifier.c.remove({'taxonomy': taxonomy_name})
-    row_id = 1
-    for row in gdocs.GDocsReaderTextDb(gdocs_id,
-        config['gdocs_username'],
-        config['gdocs_password']
-        ).read().to_list()[1:]:
-        row_id += 1
-        id, name, color, description = [unicode(c) for c in row]
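-        # Pad ids to three digits; trailing zeros encode the level
-        # (x00 = level 1, xx0 = level 2, xxx = leaf).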
-        if len(id) == 1: id = id + '00'
-        if len(id) == 2: id = id + '0'
-        iid = int(id)
-        level = 3
-        parent = id[:2] + '0'
-        if iid % 100 == 0: 
-            level = 1
-            parent = None
-        elif iid % 10 == 0:
-            level = 2
-            parent = id[:1] + '00'
-        source = 'gdocs:' + gdocs_id + ':' + str(row_id)
-        make_classifier(id, taxonomy_name, name, parent, color, description, source, level)
-
-def load_funktionenplan():
-    load_gdocs_plan(FPL_TAXONOMY, 'tljXl8WXu4oyTRCJ7YIaQDg')
-
-def load_gruppierungsplan():
-    load_gdocs_plan(GPL_TAXONOMY, 'tNbpClFqMRHmPwVwHajj4DA')
-
-
-

wdmmgext/load/departments.py

-# Import all departmental spending files, one at a time.
-# Identify department and subunit (if any) from filename.
-import csv
-from csv import Error as CsvError
-import glob
-import logging
-import os
-import re
-import time
-import urllib2
-import xlrd
-import util
-
-from pylons import config
-from swiss.date import parse as parse_date
-
-from wdmmg.lib import times
-from wdmmg.lib.loader import Loader
-
-log = logging.getLogger()
-
-DATASET_NAME = u'departments'
-SCHEME = u'departments'
-
-dataset_label = u'UK central government spending'
-dataset_currency = u'gbp'
-dataset_description = u'UK central government department spending over £25,000'
-names_root = util.Node('')
-
-_slugify_strip_re = re.compile(r'[^\w\s-]')
-_slugify_hyphenate_re = re.compile(r'[-\s]+')
-
-def slugify(value):
-    """
-    Normalizes string, converts to lowercase, removes non-alpha characters,
-    and converts spaces to hyphens.
-
-    From Django's "django/template/defaultfilters.py".
-    """
-    import unicodedata
-    if not isinstance(value, unicode):
-        value = unicode(value)
-    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
-    value = unicode(_slugify_strip_re.sub('', value).strip().lower())
-    return _slugify_hyphenate_re.sub('-', value)
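-# e.g. slugify(u'HM Revenue & Customs') == u'hm-revenue-customs'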
-
-
-def load_file(department_loader, filepath, department, subunit):
-    '''
-    Loads a file into a dataset with name 'departments'.
-    - filepath - name of file to load
-    - department - which department's spending it is
-    - subunit - sub-unit within that department
-    '''
-    def describe_dimension(key, label, description=None):
-        department_loader.create_dimension(key, label, description=description)
-
-    # Semaphore to prevent the data being loaded twice.
-    # We check by filename, not dataset as in other loaders.
-    filename = unicode(filepath.split("/")[-1])
-    describe_dimension(u'filename', 'Filename', description=u'''\
-Name of data spending file.''')
-
-    describe_dimension(u'from', 'Paid by', description=u'''\
-The entity that the money was paid from.''')
-    describe_dimension(u'to', "Paid to", description=u'''\
-The entity that the money was paid to.''')
-    describe_dimension(u'time', "Time", description=u'''\
-The accounting period in which the spending happened.''')
-    describe_dimension(u'row_id', "Row ID", description=u'''\
-Row number within the file.''')
-    describe_dimension(u'sub_unit', "Sub-unit", description=u'''\
-Department sub-unit.''')
-    describe_dimension(u'dept_family', "Dept family", description=u'''\
-Department family.''')
-    describe_dimension(u'dept_entity', "Dept entity", description=u'''\
-Department entity.''')
-    describe_dimension(u'expense_type', "Expense type", description=u'''\
-Expense type.''')
-    describe_dimension(u'expense_area', "Expense area", description=u'''\
-Expense area.''')
-    describe_dimension(u'transaction_number', "Transaction number", description=u'''\
-Departmental transaction number.''')
-    describe_dimension(u'description', "Internal description", description=u'''\
-Internal description.''')
-    describe_dimension(u'vat_number', "VAT number", description=u'''\
-VAT registration number.''')
-    describe_dimension(u'notes', "Notes", description=u'''\
-Any additional notes from the department.''')
-
-    department_entity = department_loader.create_entity(
-        name=unicode(slugify(department)),
-        label=department,
-        description=department
-        )
-
-
-    # Some 'csv' files are actually Excel spreadsheets; deal with this.
-    # Skip any blank rows at the beginning of the file.
-    try:
-        reader = csv.reader(open(filepath, "rU"))
-        header = reader.next()
-        while not [word for word in header if 'amount' in word.lower()]:
-            header = reader.next()
-    except csv.Error, e:
-        # Not actually CSV: fall back to reading the file as Excel.
-        log.warn('CSV error, falling back to xlrd: %s' % e)
-        book = xlrd.open_workbook(filename=filepath)
-        sheet = book.sheet_by_index(0)
-        header = [sheet.cell(0,col).value.strip() for col in range(0,sheet.ncols)]
-        header_row = 0
-        while not [word for word in header if 'amount' in word.lower()]:
-            header_row += 1
-            header = [sheet.cell(header_row,col).value.strip() for col in range(0,sheet.ncols)]
-            
-        def worksheet_iterator(sheet, header_row):
-            '''Iterate over the remaining rows in the sheet with
-            a csv-reader like interface'''
-            for row in range(header_row + 1, sheet.nrows):
-                yield sheet.row_values(row)
-
-        reader = worksheet_iterator(sheet, header_row)
-    # Normalise the header for matching.
-    header = [h.lower().replace("_", " ") for h in header]
-    def find_idx(matchword, required=False):
-        idxlist = [ i for i, word in enumerate(header) if matchword in word]
-        if not idxlist:
-            if required:
-                msg = '!! WARNING! Standard header %s not found in %s. Skipping file.' % (
-                        matchword, filename)
-                raise Exception(msg)
-            else:
-                return None
-        return idxlist[0]
-    try:
-        date_index = find_idx('date', required=True)
-        supplier_index = find_idx('supplier', required=True)
-        amount_index = find_idx('amount', required=True)
-    except Exception, e:
-        log.warn(unicode(e))
-        return 0
-    # Optional columns; find_idx returns None when a column is absent.
-    dept_family_index = find_idx('family')
-    entity_index = find_idx('entity')
-    vat_index = find_idx('vat')
-    expense_type_index = find_idx('expense type')
-    expense_area_index = find_idx('expense area')
-    try:
-        description_index = [i for i, word in enumerate(header)
-                             if 'description' in word
-                             or 'narrative' in word][0]
-    except IndexError:
-        description_index = None
-    try:
-        # Some source files misspell 'transaction' as 'transation'.
-        transaction_index = [i for i, word in enumerate(header)
-                             if 'transaction' in word
-                             or 'transation' in word][0]
-    except IndexError:
-        transaction_index = None
-    count = 0
-    try:
-        for row_index, row in enumerate(reader):
-             row = [unicode(r.decode("mac_roman").strip()) for r in row]
-             if not row:
-                 continue
-             # Don't assume that ordering or wording is standard.
-             # Report any files that are missing standard columns. Skip blank rows.
-             if (len(row) < 3) or (not row[0] and not row[1]):
-                 continue
-             try:
-                 date = row[date_index]
-                 supplier_value = row[supplier_index]
-                 try:
-                     amount = util.to_float(row[amount_index].replace(u'\u00A3', ''))
-                 except ValueError:
-                     log.warn('WARNING! Row missing standard entry in %s' %
-                              filename)
-                     continue
-                 # Skip rows missing a supplier or a date.
-                 if not (supplier_value and date):
-                     continue
-                 # Make an Entity for the supplier.
-                 supplier_entity = department_loader.create_entity(
-                     name=unicode(slugify(supplier_value)),
-                     label=supplier_value,
-                     description=supplier_value)
-             except IndexError:
-                 log.warn('WARNING! Row missing standard entry in %s' % filename)
-                 continue
-             # Convert date from Excel serial format if necessary.
-             if date.isdigit():
-                 date = xlrd.xldate_as_tuple(int(date),0)
-                 date = '%s/%s/%s' % (date[2], date[1], date[0])
-
-             date_parsed = parse_date(date)
-             if date_parsed is None:
-                 # TODO: keep the row with {'unparsed': date} rather than
-                 # skipping it.
-                 continue
-             row_time = times.timespan(date, date_parsed.as_datetime(),
-                                       date_parsed.as_datetime())
-
-             # Also load optional columns where present.
-             def optional(idx):
-                 if idx is not None and len(row) > idx:
-                     return row[idx]
-                 return None
-             description_value = optional(description_index)
-             vat_number_value = optional(vat_index)
-             department_family_value = optional(dept_family_index)
-             entity_value = optional(entity_index)
-             expense_type_value = optional(expense_type_index)
-             expense_area_value = optional(expense_area_index)
-             row_id_value = row_index + 1
-
-             ex = {
-                 'name': DATASET_NAME + '-' + filename.replace('.','-') + '-r' + str(row_id_value),
-                 'amount': amount,
-                 'from': department_entity,
-                 'to': supplier_entity,
-                 'time': row_time,
-                 'dept_family': department_family_value,
-                 'transaction_number': transaction_number_value,
-                 'dept_entity': entity_value,
-                 'row_id': row_id_value,
-                 'filename': filename,
-                 'sub_unit': subunit,
-                 'vat_number': vat_number_value,
-                 'notes': description_value,
-                 'expense_type': expense_type_value,
-                 'expense_area': expense_area_value
-                 }
-
-             # TODO: Generate proper query filter
-             department_loader.create_entry(**ex)
-             count = count + 1
-    except CsvError:
-        raise
-    return count
-
-
-
-
-
-
-dept_dictionary = {
-'A': 'Administration',
-'AGO': 'Attorney General\'s Office',
-'BIS': 'Department for Business, Innovation and Skills',
-'CLG': 'Department for Communities and Local Government',
-'CO': 'Cabinet Office',
-'COI': 'Central Office of Information',
-'CPS': 'Crown Prosecution Service',
-'DCMS': 'Department for Culture, Media and Sport',
-'DECC': 'Department of Energy & Climate Change',
-'DEFRA': 'Department for Environment, Food & Rural Affairs',
-'DFE': 'Department for Education',
-'DFT': 'Department for Transport',
-'DH': 'Department of Health',
-'DWP': 'Department for Work and Pensions',
-'DfID': 'Department for International Development',
-'EHRC': 'Equalities and Human Rights Commission',
-'FCO': 'Foreign & Commonwealth Office',
-'GEO': 'Government Equalities Office',
-'HMRC': 'HM Revenue & Customs',
-'HMT': 'HM Treasury',
-'HO': 'Home Office',
-'HSE': 'Health & Safety Executive',
-'MOD': 'Ministry of Defence',
-'MoJ': 'Ministry of Justice',
-'NIO': 'Northern Ireland Office',
-'NOMS': 'National Offender Management Service',
-'NSG': 'National School of Government',
-'OAG': 'Office of the Advocate-General',
-'UKTI-A': 'UK Trade & Investment - Administration',
-'UKTI-P': 'UK Trade & Investment - Programme',
-'Probation': 'Probation Trusts',
-'SO': 'Scotland Office',
-'TSol': 'Treasury Solicitor\'s Department',
-'WO': 'Wales Office',
-}
-
-def not_duplicate(filename, spending_files):
-    '''
-    Some departments have issued sheets with and without descriptive notes.
-    We should always use the versions with descriptive notes, if they exist.
-    For a given file, check whether a more descriptive version exists.
-    '''
-    if 'with-descriptions-' in filename:
-        return True
-    if "Spend-Transactions-" in filename:
-        test_filename = filename.replace(
-            "Spend-Transactions-",
-            "Spend-Transactions-with-descriptions-")
-        if test_filename in spending_files:
-            return False
-    return True
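-# e.g. a file 'Spend-Transactions-X.csv' is skipped whenever
-# 'Spend-Transactions-with-descriptions-X.csv' is also present.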
-
-def get_department(filepath):
-    '''
-    Use filename codes to find the full department name.
-    '''
-    filename = os.path.basename(filepath)
-    central_department = "Central department"
-
-    # Filenames beginning with "Spend-Transactions-"
-    if "Spend-Transactions-" in filename:
-        filename = filename.replace("Spend-Transactions-with-descriptions-",
-                                    "")
-        filename = filename.replace("Spend-Transactions-", "")
-        parts = filename.split("-")
-        try:
-            dept_name = dept_dictionary[parts[0]]
-            dept_subcode = parts[1]
-            if dept_subcode.isdigit():
-                subunit = central_department
-            else:
-                # Handle UKTI's various subunits as a special case.
-                if dept_subcode == "UKTI":
-                    dept_subcode = parts[1] + "-" + parts[2]
-                subunit = dept_dictionary[dept_subcode]
-        except (KeyError, IndexError):
-            raise RuntimeError('Filename %s not mapped in departmental names' %
-                               filename)
-        return dept_name, subunit
-
-    # Filenames like "probation-trusts-data-dec-10.csv", where the first
-    # segment is the department code.
-    parts = filename.split('%20')
-    if len(parts) == 1:
-        parts = filename.split('-')
-    if len(parts) == 1:
-        parts = filename.split('_')
-
-    department_code = parts[0].lower()
-    for (code, department_name) in dept_dictionary.items():
-        if department_code == code.lower():
-            return (department_name, central_department)
-
-    return (None, None)
-
-def make_loader():
-    loader = Loader(DATASET_NAME, ['name'], dataset_label,
-            description=dataset_description, currency=dataset_currency)
-    return loader
-
-def load(*args):
-    path = os.path.join(config['getdata_cache'], 'departments')
-    if not os.path.exists(path):
-        assert len(args) >= 1, 'You need to supply a url to retrieve from, ' + \
-            'cf. http://ckan.net/package/ukgov-25k-spending'
-        retrieve(args[0], path)
-    total = 0
-    spending_files = sorted(glob.glob(os.path.join(path, '*.csv')))
-    # If dataset does not already exist...
-    # Make a suitably configured Loader (this also creates the Dataset).
-    #if not model.Dataset.find({ 'name': DATASET_NAME}):
-    loader = make_loader()
-
-    status = []
-    for spending_file in spending_files:
-        starttime = time.time()
-        if not_duplicate(spending_file, spending_files):
-            dept_name, subunit = get_department(spending_file)
-            if dept_name is None:
-                status.append([spending_file,
-                               'Skipped. Could not find department'])
-                continue
-            filepath = os.path.join(path, spending_file)
-            try:
-                count = load_file(loader, unicode(filepath),
-                                  unicode(dept_name), unicode(subunit))
-            except Exception, E:
-                status.append([spending_file, "Exception %s" % str(E)])
-                continue
-            total = total + count
-            elapsed = time.time() - starttime
-            status.append([spending_file,
-                           "%s entries /%0.2f seconds" % (count, elapsed)])
-            continue
-        status.append([spending_file, 'Skipped'])
-
-    for (spending_file, info) in status:
-        print "%s: %s" % (os.path.basename(spending_file), info)
-    print "Total: %s" % total
-    aggregate(loader)
-
-
-def aggregate(loader=None):
-    if loader is None:
-        loader = make_loader()
-    loader.compute_aggregates()
-
-
-def retrieve(index_url, basepath):
-    '''
-    Get all CSV spending files.
-    '''
-    from BeautifulSoup import BeautifulSoup, SoupStrainer
-    if not os.path.exists(basepath):
-        os.makedirs(basepath)
-
-    page = urllib2.urlopen(index_url)
-
-    for link in BeautifulSoup(page, parseOnlyThese=SoupStrainer('a')):
-        if link.has_key('href') and link['href'][-4:]==".csv":
-            url = link['href']
-            try:
-                f = urllib2.urlopen(url)
-                file_name = url.split("/")[-1]
-                file_path = os.path.join(basepath, file_name)
-                print "Downloading file %s to %s" % (file_name, file_path)
-                local_file = open(file_path, "wb")
-                try:
-                    local_file.write(f.read())
-                finally:
-                    local_file.close()
-            #handle errors
-            except urllib2.HTTPError, e:
-                print "HTTP Error:",e.code, url
-            except urllib2.URLError, e:
-                print "URL Error:", e.reason, url

wdmmgext/load/fts.py

-# -*- coding: utf-8 -*-
-# FTS Reporting system loader
-
-import os
-from cProfile import runctx
-from lxml import etree
-from wdmmg.model import Dataset, Entry, Entity, Classifier
-from wdmmg.lib.loader import Loader
-from wdmmg.lib.munge import to_name
-from wdmmg.lib.times import for_year
-
-from pylons import config
-
-import eu
-
-# OPEN ISSUES
-#
-# * Find and apply EC budget taxonomy
-# * Consortium destination entities
-
-DATASET_NAME = u'fts'
-TAXONOMY = 'ec'
-
-FILES = ["export_2009_en.xml", "export_2008_en.xml", "export_2007_en.xml"]
-
-EC_BUDGET_PREFIX = "4-section-3-"
-
-profile_ = config.get('profile', '').lower() in ['true', '1']
-profile_limit = int(config.get('profile_loader_limit', 0))
-
-
-def num_to_float(num):
-    if num is None:
-        return None
-    # European number format: '.' as thousands separator, ',' as decimal.
-    num = num.replace('%', '').strip().replace('.', '').replace(',', '.')
-    try:
-        return float(num)
-    except ValueError:
-        return None
-
-
-def chunks(l, n):
-    """ Yield successive n-sized chunks from l.
-    """
-    for i in xrange(0, len(l), n):
-        yield l[i:i + n]
-
-
-def load_entries(loader, fn):
-    doc = etree.parse(fn)
-    commitments = doc.findall('//commitment')
-    if profile_ and profile_limit:
-        commitments = commitments[:profile_limit]
-
-    commitment_groups = chunks(commitments, 1000)
-
-    def load_entry(commitment):
-        base_entry = {}
-        base_entry['time'] = for_year(commitment.findtext('year'))
-        base_entry['total'] = num_to_float(commitment.findtext('amount'))
-        base_entry['cofinancing_rate'] = commitment.findtext(
-            'cofinancing_rate')
-        base_entry['cofinancing_rate_pct'] = num_to_float(
-            base_entry['cofinancing_rate'])
-        base_entry['position_key'] = \
-                unicode(commitment.findtext('position_key'))
-        base_entry['grant_subject'] = commitment.findtext('grant_subject')
-        base_entry['source_file'] = fn
-        base_entry['actiontype'] = commitment.findtext('actiontype')
-        budget_line = commitment.findtext('budget_line')
-
-        name, code = budget_line.rsplit('(', 1)
-        code = code.replace(')', '').replace('"', '').strip()
-        base_entry['budget_item'] = name.strip()
-        base_entry['budget_code'] = code
-
-        #conv = lambda cp: cp if cp=='XX' else int(cp)
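-        # The budget code is a dotted path; classify the entry against
-        # the EU budget taxonomy at every prefix depth, mapping depths
-        # 1-5 to title/chapter/article/item/subitem.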
-        parts = code.split('.')
-        part_prefix = []
-        for part in parts:
-            part_prefix.append(part)
-            cf_id = EC_BUDGET_PREFIX + "-".join(map(str, part_prefix))
-            cf_id = cf_id.lower()
-            classifier = loader.get_classifier(cf_id, eu.TAXONOMY)
-            dimension = {
-                1: "title",
-                2: "chapter",
-                3: "article",
-                4: "item",
-                5: "subitem"
-            }.get(len(part_prefix))
-            if dimension and classifier:
-                loader.classify_entry(base_entry, classifier, dimension)
-
-        dep_label = commitment.findtext('responsible_department')
-        dep_name = "ec-" + to_name(unicode(dep_label))
-        from_ = loader.create_entity(dep_name, label=dep_label,
-                fts_department=True)
-        base_entry['from'] = from_
-
-        for beneficiary in commitment.findall('.//beneficiary'):
-            to = {}
-            entry = base_entry.copy()
-            to['fts_beneficiary'] = True
-            to['label'] = beneficiary.findtext('name')
-            if '*' in to['label']:
-                to['label'], to['alias'] = to['label'].split('*', 1)
-            to['address'] = beneficiary.findtext('address')
-            to['city'] = beneficiary.findtext('city')
-            to['post_code'] = beneficiary.findtext('post_code')
-            to['country'] = beneficiary.findtext('country')
-
-            if to['country'] in eu.EU27:
-                entry['country'] = eu.to_country(loader, to['country'])
-
-            to['geozone'] = beneficiary.findtext('geozone')
-            to['coordinator'] = beneficiary.findtext('coordinator') == "1"
-            detail_amount = commitment.findtext('detail_amount')
-            if detail_amount is not None and len(detail_amount):
-                entry['amount'] = num_to_float(detail_amount)
-            else:
-                entry['amount'] = None
-            if entry['amount'] is None:
-                entry['amount'] = base_entry['total']
-            to_entity = loader.create_entity(to_name(unicode(to['label'])),
-                                             **to)
-            entry['to'] = to_entity
-            entry['name'] = 'fts-%s-%s-%s-%s' % (
-                to_name(entry.get('position_key')),
-                dep_name, to_entity.get('name'),
-                commitment.findtext('year'))
-            loader.create_entry(**entry)
-
-    def load_group(group):
-        '''helper function so we can profile chunks of entries'''
-        for entry in group:
-            load_entry(entry)
-
-    group_counter = 0
-    for group in commitment_groups:
-        if profile_:
-            filename = "fts-load-group-%s.stats" % group_counter
-            runctx('load_group(group)', locals(), globals(), filename)
-            group_counter = group_counter + 1
-        else:
-            load_group(group)
-
-
-def make_loader():
-    loader = Loader(DATASET_NAME, ['name'],
-                    "EC Financial Transparency System", currency='EUR', 
-                    description=(u'Financial commitments by the European '
-                                 u'Commission'))
-    return loader
-
-
-def aggregate(loader=None):
-    if loader is None:
-        if profile_:
-            # Profile loader construction, then build the real loader.
-            runctx('make_loader()', globals(), locals(),
-                   'fts-make-loader.stats')
-        loader = make_loader()
-
-    loader.create_view(Dataset, {'name': DATASET_NAME},
-                       name='default',
-                       label="Commitments per Directorate-General",
-                       dimension="dataset", breakdown="from",
-                       view_filters={})
-    loader.create_view(Dataset, {'name': DATASET_NAME},
-                       name='actiontype', label="Commitments per Action Type",
-                       dimension="dataset", breakdown="actiontype",
-                       view_filters={})
-    loader.create_view(Entity, {'fts_department': True},
-                       name='default', label="Commitments per Budget Article",
-                       dimension="from", breakdown="article",
-                       view_filters={})
-    loader.create_view(Entity, {'fts_department': True},
-                       name='actiontype', label="Commitments per Action Type",
-                       dimension="from", breakdown="actiontype",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'article'},
-                       name='fts_beneficiary',
-                       label="FTS Beneficiaries of Commitments",
-                       dimension="article", breakdown="to",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'item'},
-                       name='fts_beneficiary',
-                       label="FTS Beneficiaries of Commitments",
-                       dimension="item", breakdown="to",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'subitem'},
-                       name='fts_beneficiary',
-                       label="FTS Beneficiaries of Commitments",
-                       dimension="subitem", breakdown="to",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'article'},
-                       name='fts_subjects',
-                       label="FTS Descriptions of Commitments",
-                       dimension="article", breakdown="grant_subject",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'item'},
-                       name='fts_subjects',
-                       label="FTS Descriptions of Commitments",
-                       dimension="item", breakdown="grant_subject",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'subitem'},
-                       name='fts_subjects',
-                       label="FTS Descriptions of Commitments",
-                       dimension="subitem", breakdown="grant_subject",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'article'},
-                       name='fts_ms',
-                       label="FTS Member States of Beneficiaries",
-                       dimension="article", breakdown="country",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'item'},
-                       name='fts_ms',
-                       label="FTS Member States of Beneficiaries",
-                       dimension="item", breakdown="country",
-                       view_filters={})
-    loader.create_view(Classifier, {'taxonomy': 'eu', 'level_name': 'subitem'},
-                       name='fts_ms',
-                       label="FTS Member States of Beneficiaries",
-                       dimension="subitem", breakdown="country",
-                       view_filters={})
-    loader.create_view(Entity, {'fts_beneficiary': True},
-                       name='default',
-                       label="FTS Commitments Received (by Article)",
-                       dimension="to", breakdown="article",
-                       view_filters={})
-
-
-def load():
-    assert Dataset.find_one({"name": eu.DATASET_NAME}), \
-        "EU Budget must be loaded first!"
-
-    loader = make_loader()
-
-    assert loader is not None
-    base_dir = os.path.join(config['getdata_cache'], 'fts')
-
-    def describe_dimension(key, label, description=None):
-        pass
-        #loader.create_dimension(key, label, description=description)
-
-    describe_dimension(
-        u'actiontype', 'Action Type',
-        description=(
-            'Grants and procurements are administered by Commission '
-            'departments called "Directorates General" (DG) in charge of '
-            'implementing policies & EU funded programmes. The name of the '
-            'DG can help you identify a policy area. However, be aware '
-            'that grants and procurements in certain areas are managed by '
-            'more than one DG (for example "research" grants, which are '
-            'handed out not only by DG Research but also DG Information '
-            'Society, DG Enterprise, etc.) and that one DG may manage '
-            'funds in more than one area.'))
-    describe_dimension(u'budget_code', 'Budget (Item Code)')
-    describe_dimension(u'budget_item', 'Budget (Item Name)')
-    describe_dimension(
-        u'cofinancing_rate', 'Co-financing rate (original)',
-        description=(
-            'The co-financing rate of a project total financing '
-            'corresponds to the portion (expressed in %) financed '
-            'by the EU. Alongside EU financing, many projects may/must '
-            'receive national, private or international financing. '
-            'Grants (see article 108a of the Financial Regulation) may '
-            'take the form of a reimbursement of a specified proportion '
-            'of the eligible costs actually incurred (FTS shows then a '
-            'co-financing rate), of a lump sum financing, of a flat-rate '
-            'financing or the form of a combination of the three forms '
-            'together, so called "mixed financing". The programme is set '
-            'up on well defined objectives and reflects one or several '
-            'EU policies. It acts as general framework for the '
-            'implementation of specific projects or actions. The address '
-            'is based on the information given by the beneficiaries in '
-            'the identification documents submitted to the Commission.'))
-    describe_dimension(u'cofinancing_rate_pct', 'Co-financing rate (as percentage)')
-    describe_dimension(u'grant_subject', 'Grant Subject',
-                 description=(
-            'The subject of a grant or procurement provides general '
-            'information on the nature and purpose of the expenditure, '
-            'when available in the system.'))
-    describe_dimension(u'position_key', 'Position Key')
-    describe_dimension(u'source_file', 'Source File')
-    describe_dimension(u'total', 'Grant Total Value')
-
-    for file_name in FILES:
-        export_file_name = os.path.join(base_dir, file_name)
-        #load_entries(loader, export_file_name)
-
-    #aggregate(loader)
-    loader.compute_aggregates()
-    #country_file_name = os.path.join(base_dir, "countries.csv")
-    #countries = load_countries(country_file_name)
-
-    #stat_file_name = os.path.join(base_dir, "teilm020.tsv")
-    #load_unemployment(stat_file_name, countries)
-
-    #stat_file_name = os.path.join(base_dir, "teiis080.tsv")
-    #load_industrial_production(stat_file_name, countries)

wdmmgext/load/fts_aux.py

-# Auxiliary loaders that attach Eurostat indicators to FTS entries.
-import csv
-
-from wdmmg.model import Entry, Entity
-# Assumed: these helpers run alongside the fts loader, which defines
-# DATASET_NAME.
-from fts import DATASET_NAME
-
-def load_tsv(fn):
-    fh = open(fn, 'rb')
-    reader = csv.reader(fh, delimiter='\t')
-    header = reader.next()
-    desc, columns = header[0], header[1:]
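-    # Eurostat TSV headers look like 'sex,geo\time': row dimensions
-    # before the backslash, the column dimension after it.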
-    rows, column_dim = desc.split('\\')
-    row_dims = rows.split(',')
-    cells = []
-    for row in reader:
-        row_header = row[0].split(',')
-        row_data = dict(zip(row_dims, row_header))
-        for i, column in enumerate(row[1:]):
-            cell = row_data.copy()
-            cell[column_dim] = columns[i].strip()
-            try:
-                cell['measure'] = float(column)
-            except ValueError, ve: 
-                cell['measure'] = None
-            cells.append(cell)
-    fh.close()
-    return cells
-
-
-def set_indicator(time, geo, key, value):
-    print "Set Eurostat indicator: ", time, geo, key, value
-    #print Entry.find({'time': time, 'to.country': geo}).count()
-    Entry.c.update({'time': time, 'to.country_code': geo},
-                   {'$set': {key: value}}) 
-
-
-def load_countries(fn):
-    fh = open(fn, 'rb')
-    for row in csv.reader(fh, delimiter=',', quotechar='"'):
-        name, _, code, _, _ = row
-        Entry.c.update({'to.country': name},
-                       {'$set': {'to.country_code': code}})
-        Entity.c.update({'country': name},
-                        {'$set': {'country_code': code}})
-    fh.close()
-
-
-def load_statistic(fn, key, label, time_suffix='M12', filters={}):
-    times = dict([(str(t) + time_suffix, t) for t in Entry.c.distinct('time')])
-    for cell in load_tsv(fn):
-        if not cell.get('sex') == 'T':
-            continue
-        update_time = times.get(cell.get('time'))
-        if not update_time:
-            continue
-        update_geo = cell.get('geo')
-        Entry.describe_key(key, label, DATASET_NAME)
-        set_indicator(update_time, update_geo, key, cell.get('measure'))
-
-
-def load_unemployment(fn, countries, key='teilm020',
-    label='Employment - [teina300]; Percentage change q/q-1 (SA)', time_suffix='M12'):
-    # Same logic as load_statistic; delegate to it.
-    load_statistic(fn, key, label, time_suffix=time_suffix)
-
-
-def load_industrial_production(fn, countries, key='teiis080', 
-    label='Industrial production - total industry (excluding construction) - [teiis080]; Percentage change m/m-1 (SA)', time_suffix='M12'):
-    cells = load_tsv(fn)
-    times = dict([(str(t) + time_suffix, t) for t in Entry.c.distinct('time')])
-    for cell in cells:
-        #if not cell.get('unit') == 'PCH_Q1_SA': 
-        #    continue
-        update_time = times.get(cell.get('time'))
-        if not update_time: continue
-        update_geo = cell.get('geo')
-        Entry.describe_key(key, label, DATASET_NAME)
-        set_indicator(update_time, update_geo, key, cell.get('measure'))
-