Commits

Chris Mutel committed 604f53f

0.12. Change the algorithm that creates filenames to prevent illegal characters; better testing and small bug fixes.

  • Parent commits 9847613
  • Tags 0.12


Files changed (16)

 Changelog
 *********
 
-0.11 ()
-=======
+0.12 (2014-02-04)
+=================
+
+**bw2-uptodate.py is required for this update**
+
+Safe filenames
+--------------
+
+The algorithm used to create filenames was changed to prevent illegal characters from being used. See `utils.safe_filename`.
+
+0.11 (2014-01-28)
+=================
 
 **bw2-uptodate.py is required for this update**
 
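As a quick illustration of the rename this update performs on disk, here is a hedged sketch (hypothetical database name; the 32-character MD5 digest is elided):

    from bw2data.utils import safe_filename

    # Hypothetical database name containing "/", which is illegal in
    # filenames on most filesystems.
    safe_filename(u"NREL/US LCI")
    # -> u"NRELUS-LCI.<md5 hex digest>"
    # Before 0.12 the intermediate file was named from the raw string
    # (u"NREL/US LCI.2.pickle"); afterwards it is the slugified, hashed
    # form (u"NRELUS-LCI.<md5 hex digest>.2.pickle").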

File bw2data/__init__.py

 # -*- coding: utf-8 -*-
-__version__ = (0, 11, "rc2")
+__version__ = (0, 12)
 
 from ._config import config
 from .meta import databases, methods, mapping, reset_meta, geomapping, \

File bw2data/data_store.py

 # -*- coding: utf-8 -*-
+from . import config
 from .errors import UnknownObject
-from . import config
+from .utils import safe_filename
 import numpy as np
 import os
 import warnings
     @property
     def filename(self):
         """Can be overwritten in cases where the filename is not the name"""
-        return self.name
+        return safe_filename(self.name)
 
     def register(self, **kwargs):
         """Register an object with the metadata store.
         """Validate data. Must be called manually.
 
         Need some metaprogramming because class methods have `self` injected automatically."""
-        self.validator.__func__(data)
+        self.validator(data)
         return True
 
     def backup(self):

File bw2data/database.py

 from .query import Query
 from .data_store import DataStore
 from .units import normalize_units
-from .utils import natural_sort, MAX_INT_32, TYPE_DICTIONARY
+from .utils import natural_sort, MAX_INT_32, TYPE_DICTIONARY, safe_filename
 from .validate import db_validator
 from time import time
 import datetime
 
     """
     metadata = databases
-    valdiator = db_validator
+    validator = db_validator
     dtype_fields = [
         ('input', np.uint32),
         ('output', np.uint32),
             Filename (not path)
 
         """
-        return "%s.%i" % (
-            self.name,
+        return u"%s.%i" % (
+            safe_filename(self.name),
             version or self.version
         )
 
         """
         directory = os.path.join(config.dir, "intermediate")
         files = natural_sort(filter(
-            lambda x: ".".join(x.split(".")[:-2]) == self.name,
+            lambda x: ".".join(x.split(".")[:-2]) == safe_filename(self.name),
             os.listdir(directory)))
         return sorted([(int(name.split(".")[-2]),
             datetime.datetime.fromtimestamp(os.stat(os.path.join(

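The version listing above now compares on-disk names against `safe_filename(self.name)`. A minimal sketch of the string surgery `versions()` performs, using a hypothetical filename (the hash segment is a placeholder, not a real digest):

    # Intermediate files are named u"<slug>.<md5>.<version>.pickle"; the
    # version is the second-to-last dot-separated field.
    fname = u"biosphere.0123abcd.2.pickle"        # hypothetical
    stem = u".".join(fname.split(u".")[:-2])      # u"biosphere.0123abcd"
    version = int(fname.split(u".")[-2])          # 2
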
File bw2data/ia_data_store.py

 # -*- coding: utf-8 -*-
-from . import config
 from .data_store import DataStore
-from copy import copy
-from errors import UnknownObject, MissingIntermediateData
-from utils import random_string
+from .utils import safe_filename
 import hashlib
-import os
 import string
-import warnings
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
 
 
 def abbreviate(names, length=8):
+    safe_names = [safe_filename(x, False) for x in names]
     abbrev = lambda x: x if x[0] in string.digits else x[0].lower()
-    name = u" ".join(names).split(" ")[0].lower() + \
-        u"".join([abbrev(x) for x in u" ".join(names).split(" ")[1:]])
-    return name + u"-" + hashlib.md5(unicode(u"-".join(names))).hexdigest()
+    name = u" ".join(safe_names).split(" ")[0].lower() + \
+        u"".join([abbrev(x) for x in u" ".join(safe_names).split(" ")[1:]])
+    return name + u"." + hashlib.md5(unicode(u"-".join(names))).hexdigest()
 
 
 class ImpactAssessmentDataStore(DataStore):

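`abbreviate` now sanitizes each name component before building the short form, and joins the abbreviation to the hash with a dot instead of a hyphen. A worked sketch of the algorithm shown above (hypothetical method name; digest elided):

    from bw2data.ia_data_store import abbreviate

    # For (u"IPCC", u"2007", u"GWP 100a"):
    #   sanitized names -> [u"IPCC", u"2007", u"GWP-100a"]
    #   first word      -> u"ipcc" (lowercased whole)
    #   later words     -> u"2007" (kept: starts with a digit), u"g"
    abbreviate((u"IPCC", u"2007", u"GWP 100a"))
    # -> u"ipcc2007g.<md5 of u'IPCC-2007-GWP 100a'>"
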
File bw2data/tests/__init__.py

 from .database import DatabaseTest
 from .data_store import DataStoreTestCase
 from .geo import GeoTest
-from .ia import IADSTest, MethodTest, WeightingTest
+from .ia import IADSTest, MethodTest, WeightingTest, NormalizationTest
 from .simapro import SimaProImportTest
 from .sparse import SparseMatrixProxyTest
 from .utils import UtilsTest

File bw2data/tests/data_store.py

 # -*- coding: utf-8 -*-
 from . import BW2DataTest
-from .. import config, Database, mapping
+from .. import config
 from ..data_store import DataStore
+from ..errors import UnknownObject
 from ..serialization import SerializedDict
-from ..errors import UnknownObject
-import hashlib
+from voluptuous import Schema
 import os
 try:
     import cPickle as pickle
 class MockDS(DataStore):
     """Mock DataStore for testing"""
     metadata = metadata
-    validator = lambda x: True
+    validator = Schema(int)
     dtype_fields = []
 
     def process_data(self, row):
-        return (), 0
+        return (), row
 
 
 class DataStoreTestCase(BW2DataTest):
 
     def test_validation(self):
         d = MockDS("cat")
-        self.assertTrue(d.validate("dog"))
+        self.assertTrue(d.validate(4))
 
     def test_processed_array(self):
         d = MockDS("happy")
         d.register()
-        d.write([])
+        d.write([{'amount': 42, 'uncertainty type': 7}])
         d.process()
         fp = os.path.join(config.dir, u"processed", d.filename + u".pickle")
         array = pickle.load(open(fp, "rb"))
 
         fieldnames = {x[0] for x in d.base_uncertainty_fields}
-        self.assertFalse(fieldnames.difference(set(array.dtype.names)))
+        self.assertEqual(fieldnames, set(array.dtype.names))
+        self.assertEqual(array.shape, (1,))
+        self.assertEqual(array[0]['uncertainty_type'], 7)
+        self.assertEqual(array[0]['amount'], 42)

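The tightened assertions above inspect the structured array written by `process()`. For readers unfamiliar with numpy record dtypes, a self-contained sketch of the shape being asserted (field names match the test; the dtypes here are placeholders, not the library's actual choices):

    import numpy as np

    # One row per written data point; fields are addressed by name.
    arr = np.zeros(1, dtype=[('uncertainty_type', np.uint8),
                             ('amount', np.float32)])
    arr[0]['uncertainty_type'] = 7
    arr[0]['amount'] = 42
    assert arr.shape == (1,)
    assert arr[0]['amount'] == 42
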
File bw2data/tests/database.py

 # -*- coding: utf-8 -*-
 from . import BW2DataTest
-from .. import Database, databases, mapping, geomapping, config
+from .. import config
+from ..database import Database
 from ..errors import UnknownObject
+from ..meta import mapping, geomapping, databases
+from ..validate import db_validator
 from .fixtures import food, biosphere
 import copy
 import os
         with self.assertRaises(AssertionError):
             d.revert(10)
 
+    def test_versions(self):
+        d = Database("biosphere")
+        d.register(depends=[])
+        d.write(biosphere)
+        self.assertEqual(
+            [x[0] for x in d.versions()], [1]
+        )
+        d.write(biosphere)
+        self.assertEqual(
+            [x[0] for x in d.versions()], [1, 2]
+        )
+
     def test_register(self):
         database = Database("testy")
         database.register()
     def test_processed_array(self):
         database = Database("a database")
         database.register()
-        database.write({})
+        database.write({("a database", 2): {
+            'type': 'process',
+            'exchanges': [{
+                'input': ("a database", 2),
+                'amount': 42,
+                'uncertainty type': 7,
+                'type': 'production'
+            }]
+        }})
         database.process()
         fp = os.path.join(
             config.dir,
         array = pickle.load(open(fp, "rb"))
         fieldnames = {'input', 'output', 'row', 'col', 'type'}
         self.assertFalse(fieldnames.difference(set(array.dtype.names)))
+        self.assertEqual(array.shape, (1,))
+        self.assertEqual(array[0]['uncertainty_type'], 7)
+        self.assertEqual(array[0]['amount'], 42)
 
+    def test_validator(self):
+        database = Database("a database")
+        self.assertTrue(database.validate({}))
+
+    def test_base_class(self):
+        database = Database("a database")
+        self.assertEqual(database.validator, db_validator)
+        self.assertEqual(database.metadata, databases)
+        self.assertEqual(
+            [x[0] for x in database.dtype_fields],
+            ['input', 'output', 'row', 'col', 'type']
+        )
+

File bw2data/tests/ia.py

 # -*- coding: utf-8 -*-
 from . import BW2DataTest
-from .. import config, Database, Method, mapping, geomapping, Weighting
+from .. import config
+from ..database import Database
 from ..ia_data_store import abbreviate, ImpactAssessmentDataStore as IADS
+from ..meta import mapping, geomapping, weightings, normalizations, methods
+from ..method import Method
 from ..serialization import CompoundJSONDict
+from ..validate import weighting_validator, normalization_validator, ia_validator
+from ..weighting_normalization import Normalization, Weighting
 import hashlib
 import os
 try:
     def test_processed_array(self):
         method = Method(("a", "method"))
         method.register()
-        method.write([])
+        method.write([[("foo", "bar"), 42]])
         method.process()
         fp = os.path.join(config.dir, u"processed", method.filename + u".pickle")
         array = pickle.load(open(fp, "rb"))
 
-        fieldnames = {'flow', 'geo', 'row', 'col'}
-        self.assertFalse(fieldnames.difference(set(array.dtype.names)))
+        fieldnames = {x[0] for x in method.base_uncertainty_fields}.union({'flow', 'geo', 'row', 'col'})
+        self.assertEqual(fieldnames, set(array.dtype.names))
+        self.assertEqual(array[0]['amount'], 42)
+
+    def test_base_class(self):
+        method = Method(("a", "method"))
+        self.assertEqual(method.validator, ia_validator)
+        self.assertEqual(method.metadata, methods)
+        self.assertEqual([x[0] for x in method.dtype_fields], ['flow', 'geo', 'row', 'col'])
+
+    def test_validator(self):
+        method = Method(("a", "method"))
+        self.assertTrue(method.validate([]))
 
 
 class WeightingTest(BW2DataTest):
             w.write([2, 4])
 
     def test_process(self):
-        w = Weighting(("foo",))
-        w.register()
-        w.write([2])
-        w.process()
+        weighting = Weighting(("foo",))
+        weighting.register()
+        weighting.write([42])
+        weighting.process()
+
+        fp = os.path.join(config.dir, u"processed", weighting.filename + u".pickle")
+        array = pickle.load(open(fp, "rb"))
+
+        fieldnames = {x[0] for x in weighting.base_uncertainty_fields}
+        self.assertEqual(fieldnames, set(array.dtype.names))
+        self.assertEqual(array[0]['amount'], 42)
+
+    def test_base_class(self):
+        weighting = Weighting(("foo",))
+        self.assertEqual(weighting.validator, weighting_validator)
+        self.assertEqual(weighting.metadata, weightings)
+        self.assertEqual(weighting.dtype_fields, [])
+
+    def test_validator(self):
+        weighting = Weighting(("foo",))
+        self.assertTrue(weighting.validate([{'amount': 1}]))
+
+
+class NormalizationTest(BW2DataTest):
+    def test_base_class(self):
+        norm = Normalization(("foo",))
+        self.assertEqual(norm.validator, normalization_validator)
+        self.assertEqual(norm.metadata, normalizations)
+        self.assertEqual([x[0] for x in norm.dtype_fields], ['flow', 'index'])
+
+    def test_add_mappings(self):
+        norm = Normalization(("foo",))
+        norm.register()
+        norm.write([[("foo", "bar"), 42]])
+        self.assertTrue(("foo", "bar") in mapping)
+
+    def test_process_data(self):
+        norm = Normalization(("foo",))
+        norm.register()
+        norm.write([[("foo", "bar"), 42]])
+        norm.process()
+
+        fp = os.path.join(config.dir, u"processed", norm.filename + u".pickle")
+        array = pickle.load(open(fp, "rb"))
+
+        fieldnames = {x[0] for x in norm.base_uncertainty_fields}.union({'flow', 'index'})
+        self.assertEqual(fieldnames, set(array.dtype.names))
+        self.assertEqual(array[0]['amount'], 42)
+        self.assertEqual(array[0]['flow'], mapping[("foo", "bar")])

File bw2data/tests/validation.py

-from voluptuous import *
+from voluptuous import Invalid
 import unittest2
 from ..validate import *
 

File bw2data/updates.py

 # -*- coding: utf-8 -*-
-from . import Database, databases, Method, methods, config
+from . import Database, databases, Method, methods, config, Weighting, \
+    weightings, Normalization, normalizations
 from .colors import Fore, safe_colorama
 from .ia_data_store import abbreviate
 from .units import normalize_units
 from .utils import activity_hash
-import numpy as np
 import progressbar
-import sys
+import os
 import warnings
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 
 UPTODATE_WARNING = Fore.RED + "\n\nYour data needs to be updated." + Fore.RESET \
     + " Please run the following program on the command line:\n\n\t" + \
         '0.10 units restandardization': {
             "method": "units_renormalize",
             "explanation": Fore.GREEN + "0.10 units restandardization:" + Fore.RESET + """\n\tBrightway2 tries to normalize units so that they are consistent from machine to machine, and person to person. For example, ``m2a`` is changed to ``square meter-year``. This update adds more data normalizations, and needs to updates links across databases."""},
-        '0.11 reprocess IA methods': {
-            "method": "reprocess_all_methods",
-            "explanation": Fore.GREEN + "0.11 reprocess IA methods" + Fore.RESET + """\n\t0.11 changed the format for processed IA methods, and the algorithm used to shorten IA method names."""},
+        # '0.11 reprocess IA methods': {
+        #     "method": "reprocess_all_lcia",
+        #     "explanation": Fore.GREEN + "0.11 reprocess IA methods" + Fore.RESET + """\n\t0.11 changed the format for processed IA methods, and the algorithm used to shorten IA method names."""},
+        "0.12 reprocess inventory databases": {
+            'method': "redo_all_databases_0_12",
+            "explanation": Fore.GREEN + "0.12 reprocess inventory databases" + Fore.RESET + "\n\t0.12 changed the algorithm to create filenames based on database and LCIA method names, to make sure they don't contain illegal characters."},
+        "0.12 reprocess IA databases": {
+            "method": "reprocess_all_lcia",
+            "explanation": Fore.GREEN + "0.12 reprocess IA databases" + Fore.RESET + "\n\t0.12 changed the algorithm to create filenames based on database and LCIA method names, to make sure they don't contain illegal characters."},
     }
 
     @staticmethod
         """
         updates = []
 
-        # Remove in 0.12
+        # Remove in 1.0
         if "upgrades" in config.p:
             config.p['updates'] = config.p['upgrades']
             del config.p['upgrades']
         return updates
 
     @staticmethod
-    def reprocess_all_methods():
-        """Change name hashing function from random characters (!?) to MD5 hash. Need to update abbreviations and rewrite all data."""
-        print "Updating all LCIA methods"
+    def redo_all_databases_0_12():
+        def load_data_old_filename(name, version):
+            return pickle.load(open(os.path.join(
+                config.dir,
+                u"intermediate",
+                name + u"." + unicode(version) + u".pickle"
+            ), "rb"))
+
+        print "Updating all LCI databases"
 
         pbar = progressbar.ProgressBar(
             widgets=widgets,
-            maxval=len(methods)
+            maxval=len(databases)
         ).start()
 
-        for index, method_key in enumerate(methods):
-            method = Method(method_key)
-            method_data = method.load()
-            methods[method_key]['abbreviation_old'] = \
-                methods[method_key]['abbreviation']
-            methods[method_key]['abbreviation'] = abbreviate(method_key)
-            methods.flush()
-            method.write(method_data)
-            method.process()
+        for index, name in enumerate(databases):
+            db = Database(name)
+            data = load_data_old_filename(name, db.version)
+            db.write(data)
+            db.process()
+
+            databases[name]['filename'] = db.filename
+            databases.flush()
+
             pbar.update(index)
 
         pbar.finish()
 
     @staticmethod
+    def reprocess_all_lcia():
+        """0.11: Change name hashing function from random characters (!?) to MD5 hash. Need to update abbreviations and rewrite all data.
+
+        0.12: Make sure strings are sanitized to be able to be used in filenames. Need to update abbreviations and rewrite all data."""
+        LCIA = [
+            (methods, Method, "LCIA methods"),
+            (weightings, Weighting, "LCIA weightings"),
+            (normalizations, Normalization, "LCIA normalizations")
+        ]
+
+        for (meta, klass, name) in LCIA:
+            if meta.list:
+                print "Updating all %s" % name
+
+                pbar = progressbar.ProgressBar(
+                    widgets=widgets,
+                    maxval=len(meta)
+                ).start()
+
+                for index, key in enumerate(meta):
+                    obj = klass(key)
+                    data = obj.load()
+                    meta[key]['abbreviation_old'] = \
+                        meta[key]['abbreviation']
+                    meta[key]['abbreviation'] = abbreviate(key)
+                    meta.flush()
+                    obj.write(data)
+                    obj.process()
+                    pbar.update(index)
+
+                pbar.finish()
+
+    @staticmethod
     def units_renormalize():
         """Renormalize some units, making many activity datasets with hash ids change."""
         db_versions = {name: databases[name]['version'] for name in databases.list}

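The `UPDATES` registry maps human-readable labels to staticmethod names. A hedged sketch of how an updater script like bw2-uptodate.py might drive it (this is not the actual script; `config.p.get` and the bookkeeping step are assumptions):

    # Each pending label names a staticmethod on Updates; look it up by
    # name, run it once, then record it as applied in config.p.
    for label, update in Updates.UPDATES.items():
        if label not in config.p.get('updates', {}):
            print update['explanation']
            getattr(Updates, update['method'])()
            config.p.setdefault('updates', {})[label] = True
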
File bw2data/utils.py

 import re
 import requests
 import string
+import unicodedata
 import zipfile
 try:
     import cStringIO as StringIO
 
 DOWNLOAD_URL = "http://brightwaylca.org/data/"
 
+re_slugify = re.compile('[^\w\s-]', re.UNICODE)
+
 
 def natural_sort(l):
     """Sort the given list in the way that humans expect"""
     return method
 
 
+def safe_filename(string, add_hash=True):
+    """Convert arbitrary strings to make them safe for filenames. Substitutes strange characters, and uses unicode normalization. Appends hash of name to avoid collisions.
+
+    From http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python"""
+    hashed = hashlib.md5(string).hexdigest()
+    safe = re.sub(
+        '[-\s]+',
+        '-',
+        unicode(
+            re_slugify.sub(
+                '',
+                unicodedata.normalize('NFKD', unicode(string))
+            ).strip()
+        )
+    )
+    if add_hash:
+        return safe + u"." + hashed
+    else:
+        return safe
+
+
 def combine_databases(name, *dbs):
     """Combine databases into new database called ``name``."""
     pass

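A short walkthrough of the pipeline above, on a hypothetical input: NFKD-normalize, drop anything that is not a word character, whitespace, or a hyphen, then collapse runs of whitespace and hyphens into single hyphens.

    from bw2data.utils import safe_filename

    safe_filename(u"Ecoinvent 2.2 (copy)", add_hash=False)
    # re_slugify drops ".", "(", ")"     -> u"Ecoinvent 22 copy"
    # whitespace runs collapse to "-"    -> u"Ecoinvent-22-copy"
    # with add_hash=True, u".<md5 of the original string>" is appended
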
File bw2data/validate.py

 
 uncertainty_dict = {
     Required("amount"): Any(float, int),
-    "uncertainty_type": int,
+    "uncertainty type": int,
     "loc": Any(float, int),
     "scale": Any(float, int),
     "shape": Any(float, int),

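The schema key changes from `uncertainty_type` to `uncertainty type`, matching the dictionaries the tests now write. A minimal voluptuous sketch of validating against the renamed key (only two fields of the schema reproduced here):

    from voluptuous import Any, Invalid, Required, Schema

    uncertainty = Schema({
        Required("amount"): Any(float, int),
        "uncertainty type": int,
    })

    uncertainty({"amount": 42, "uncertainty type": 7})  # passes
    try:
        uncertainty({"uncertainty type": 7})  # "amount" is required
    except Invalid:
        pass
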
File bw2data/weighting_normalization.py

 
     """
     metadata = weightings
-    valdiator = weighting_validator
+    validator = weighting_validator
     dtype_fields = []
 
     def write(self, data):
         """Because of DataStore assumptions, need a one-element list"""
-        if not len(data) == 1 or not isinstance(data, list):
+        if not isinstance(data, list) or not len(data) == 1:
             raise ValueError("Weighting data must be one-element list")
         super(Weighting, self).write(data)
 
 
     """
     metadata = normalizations
-    valdiator = normalization_validator
+    validator = normalization_validator
     dtype_fields = [
         ('flow', np.uint32),
         ('index', np.uint32),

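The reordered guard matters for non-list input: `isinstance` is now checked before `len`, so passing something without a length raises the intended ValueError instead of a TypeError from `len()`. A quick illustration (hypothetical weighting key):

    w = Weighting(("hypothetical",))
    w.register()
    w.write([42])      # OK: a one-element list
    try:
        w.write(42)    # not a list: now a clean ValueError
    except ValueError:
        pass
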
File docs/conf.py

 # built documents.
 #
 # The short X.Y version.
-version = '0.11'
+version = '0.12'
 # The full version, including alpha/beta/rc tags.
-release = '0.11'
+release = '0.12'
 
 import sys
 from os.path import abspath, dirname
 
 setup(
     name='bw2data',
-    version="0.11RC2",
+    version="0.12",
     packages=packages,
     author="Chris Mutel",
     author_email="cmutel@gmail.com",