Commits

Rufus Pollock committed 7fd1894

[data][s]: refactor data.py into a class structure and add a couple of basic tests.

  • Participants
  • Parent commits 6dce5c7

Comments (0)

Files changed (2)

File eurostat/data.py

 from swiss.misc import floatify
 base = 'http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?file=data/'
 
-ourcache = cache.Cache('static/cache')
+cachepath = os.path.join(os.path.dirname(__file__), 'static', 'cache')
+ourcache = cache.Cache(cachepath)
 
-def download(dataset_id):
-    '''Download a eurostat dataset based on its `dataset_id`
-    '''
-    fn = dataset_id + '.tsv.gz'
-    url = base + fn
-    # do not use retrieve as we get random ugly name
-    fp = ourcache.cache_path(fn)
-    ourcache.download(url, fp)
-    newfp = fp[:-3]
-    contents = gzip.GzipFile(fp).read()
-    open(newfp, 'w').write(contents)
-    return newfp
+class Data(object):
+    def download(self, dataset_id):
+        '''Download a eurostat dataset based on its `dataset_id`
+        '''
+        fn = dataset_id + '.tsv.gz'
+        url = base + fn
+        # do not use retrieve as we get random ugly name
+        fp = ourcache.cache_path(fn)
+        if not os.path.exists(fp):
+            ourcache.download(url, fp)
+        newfp = fp[:-3]
+        contents = gzip.GzipFile(fp).read()
+        open(newfp, 'w').write(contents)
+        return newfp
 
-def extract(newfp):
-    '''Extract data from the tsv file at `newfp`, clean it and save it as json
-    to a file with the same basename and the extension json.'''
-    reader = tabular.CsvReader()
-    tab = reader.read(open(newfp), dialect='excel-tab')
-    # some data has blank top row!
-    if not tab.header:
-        tab.header = tab.data[0]
-        del tab.data[0]
-    alldata = [tab.header] + tab.data
-    transposed = zip(*alldata)
-    tab.header = transposed[0]
-    def parsedate(cell):
-        if 'Q' in cell:
-            items = cell.split('Q')
-            return float(items[0]) + 0.25 * (int(items[1]) - 1)
-        elif 'M' in cell:
-            items = cell.split('M')
-            return float(items[0]) + 1/12.0 * (int(items[1]) - 1)
-    def cleanrow(row):
-        newrow = [ x.strip() for x in row ]
-        newrow = [parsedate(newrow[0])] + [ floatify(x) for x in newrow[1:] ]
-        return newrow
-    tab.data = map(cleanrow, transposed[1:])
-    writer = tabular.JsonWriter()
-    jsonfp = newfp.split('.')[0] + '.json'
-    writer.write(tab, open(jsonfp, 'w'))
+    def extract(self, newfp):
+        '''Extract data from the tsv file at `newfp`, clean it and save it as json
+        to a file with the same basename and the extension json.'''
+        reader = tabular.CsvReader()
+        tab = reader.read(open(newfp), dialect='excel-tab')
+        # some data has blank top row!
+        if not tab.header:
+            tab.header = tab.data[0]
+            del tab.data[0]
+        alldata = [tab.header] + tab.data
+        transposed = zip(*alldata)
+        tab.header = transposed[0]
+        def parsedate(cell):
+            if 'Q' in cell:
+                items = cell.split('Q')
+                return float(items[0]) + 0.25 * (int(items[1]) - 1)
+            elif 'M' in cell:
+                items = cell.split('M')
+                return float(items[0]) + 1/12.0 * (int(items[1]) - 1)
+        def cleanrow(row):
+            newrow = [ x.strip() for x in row ]
+            newrow = [parsedate(newrow[0])] + [ floatify(x) for x in newrow[1:] ]
+            return newrow
+        tab.data = map(cleanrow, transposed[1:])
+        writer = tabular.JsonWriter()
+        jsonfp = newfp.split('.')[0] + '.json'
+        writer.write(tab, open(jsonfp, 'w'))
 
-PEEI_LIST = 'peeis.json'
-def peeis():
-    '''Scrape the Eurostat Principal Economic Indicators (PEEI) list'''
-    # turns out they iframe the data!
-    # url = 'http://epp.eurostat.ec.europa.eu/portal/page/portal/euroindicators/peeis/'
-    url = 'http://epp.eurostat.ec.europa.eu/cache/PEEIs/PEEIs_EN.html'
-    fp = ourcache.retrieve(url)
-    html = open(fp).read()
-    tdataset_ids = re.findall(r'goToTGM.*pcode=([^&]+)&', html)
-    # de-dup
-    dataset_ids = []
-    for _id in tdataset_ids:
-        if _id not in dataset_ids:
-            dataset_ids.append(_id)
-    reader = tabular.HtmlReader()
-    tab = reader.read(fp, 1)
-    peeis = []
-    for row in tab.data[3:]:
-        series_name = row[1].strip()
-        if series_name.startswith('3 month') or series_name[0] not in '%0123456789':
-            peeis.append(series_name)
-        if series_name == 'Euro-dollar exchange rate':
-            break
-    peeis_index = {}
-    for count, (_id, title) in enumerate(zip(dataset_ids, peeis)):
-        peeis_index[_id] = {
-            'title': title,
-            'order': count
-            }
-    print len(dataset_ids), dataset_ids
-    print len(peeis_index)
-    dumppath = ourcache.cache_path(PEEI_LIST)
-    json.dump(peeis_index, open(dumppath, 'w'), indent=2)
-    print 'PEEIs extracted to %s' % dumppath
+    PEEI_LIST = 'peeis.json'
+    def peeis(self):
+        '''Scrape the Eurostat Principal Economic Indicators (PEEI) list'''
+        # turns out they iframe the data!
+        # url = 'http://epp.eurostat.ec.europa.eu/portal/page/portal/euroindicators/peeis/'
+        url = 'http://epp.eurostat.ec.europa.eu/cache/PEEIs/PEEIs_EN.html'
+        fp = ourcache.retrieve(url)
+        html = open(fp).read()
+        tdataset_ids = re.findall(r'goToTGM.*pcode=([^&]+)&', html)
+        # de-dup
+        dataset_ids = []
+        for _id in tdataset_ids:
+            if _id not in dataset_ids:
+                dataset_ids.append(_id)
+        reader = tabular.HtmlReader()
+        tab = reader.read(fp, 1)
+        peei_titles = []
+        for row in tab.data[3:]:
+            series_name = row[1].strip()
+            if series_name.startswith('3 month') or series_name[0] not in '%0123456789':
+                peei_titles.append(series_name)
+            if series_name == 'Euro-dollar exchange rate':
+                break
+        peeis = {}
+        for count, (_id, title) in enumerate(zip(dataset_ids, peei_titles)):
+            peeis[_id] = {
+                'title': title,
+                'order': count
+                }
+        dumppath = ourcache.cache_path(self.PEEI_LIST)
+        json.dump(peeis, open(dumppath, 'w'), indent=2)
+        print 'PEEIs extracted to %s' % dumppath
+        return peeis
 
-def peeis_download():
-    '''Download (and extract to json) all PEEI datasets.'''
-    peei_list_fp = ourcache.cache_path(PEEI_LIST)
-    peeis = json.load(open(peei_list_fp))
-    for eurostatid in sorted(peeis.keys()):
-        print 'Processing: %s' % eurostatid
-        fp = download(eurostatid)
-        extract(fp)
+    def peeis_download(self):
+        '''Download (and extract to json) all PEEI datasets.'''
+        peei_list_fp = ourcache.cache_path(self.PEEI_LIST)
+        peeis = json.load(open(peei_list_fp))
+        for eurostatid in sorted(peeis.keys()):
+            print 'Processing: %s' % eurostatid
+            fp = self.download(eurostatid)
+            self.extract(fp)
     
 from swiss.clitools import _main
 if __name__ == '__main__':
-    _main(locals())
+    _main(Data)
 

File eurostat/test/test_data.py

+from eurostat.data import Data
+
+class TestData:
+    def test_peeis(self):
+        d = Data()
+        out = d.peeis()
+        assert len(out) == 22
+
+    def test_peeis_download(self):
+        d = Data()
+        d.peeis_download()
+