Commits

Chris Mutel committed a666404

0.9.0-alpha. Everything and more is changed.

  • Parent commits 49a2e22

Files changed (20)

 Brightway2 data management
 ==========================
 
-This package provides tools for the management of inventory databases and impact assessment methods. It is part of the `Brightway2 LCA framework <http://brightwaylca.org>`_. `Online documentation <http://bw2data.readthedocs.org>`_ is available, and the source code is hosted on `Bitucket <https://bitbucket.org/cmutel/bw2data>`_.
+This package provides tools for the management of inventory databases and impact assessment methods. It is part of the `Brightway2 LCA framework <http://brightwaylca.org>`_. `Online documentation <http://bw2data.readthedocs.org>`_ is available, and the source code is hosted on `Bitbucket <https://bitbucket.org/cmutel/brightway2-data>`_.

File bw2data/_config.py

         To set the environment variable:
 
         * Unix/Mac: ``export BRIGHTWAY2_DIR=/path/to/brightway2/directory``
-        * Windows XP: Create a ``brightway2path.txt`` file in your home directory (C:\Users\Your Name\) with a single line that is the directory path you want to use.
+        * Windows XP: Instead of an environment variable, just create a ``brightway2path.txt`` file in your home directory (C:\Users\Your Name\) with a single line that is the directory path you want to use.
         * Windows 7: ``setx BRIGHTWAY2_DIR=\path\to\brightway2\directory``
 
         """

File bw2data/database.py

 from errors import MissingIntermediateData, UnknownObject
 from query import Query
 from time import time
-from utils import natural_sort, MAX_INT_32
+from units import normalize_units
+from utils import natural_sort, MAX_INT_32, TYPE_DICTIONARY
 from validate import db_validator
 import datetime
 import numpy as np
             depends=databases[self.database]["depends"],
             num_processes=len(data))
         new_database.write(data)
+        return new_database
 
     def backup(self):
         """Save a backup to ``backups`` folder.
             config.cache[self.database] = self.load(version)
         self.process(version)
 
-    def register(self, format, depends, num_processes):
+    def register(self, format, depends, num_processes, version=None):
         """Register a database with the metadata store.
 
         Databases must be registered before data can be written.
             "from format": format,
             "depends": depends,
             "number": num_processes,
-            "version": 0
+            "version": version or 0
             }
 
     def deregister(self):
             raise UnknownObject("This database is not yet registered")
         databases.increment_version(self.database, len(data))
         mapping.add(data.keys())
+        for ds in data.values():
+            ds["unit"] = normalize_units(ds["unit"])
         geomapping.add([x["location"] for x in data.values() if \
             x.get("location", False)])
         if config.p.get("use_cache", False) and self.database in config.cache:
             ('geo', np.uint32),
             ('row', np.uint32),
             ('col', np.uint32),
-            ('technosphere', np.bool),
+            ('type', np.uint8),
             ('amount', np.float32),
             ('sigma', np.float32),
             ('minimum', np.float32),
             ('maximum', np.float32),
             ('negative', np.bool)]
-        arr = np.zeros((num_exchanges, ), dtype=dtype)
+        arr = np.zeros((num_exchanges + len(data), ), dtype=dtype)
         arr['minimum'] = arr['maximum'] = arr['sigma'] = np.NaN
         count = 0
-        for key in data:
-            for exc in data[key]["exchanges"]:
+        for key in sorted(data.keys(), key=lambda x: x[1]):
+            production_found = False
+            for exc in sorted(data[key]["exchanges"],
+                    key=lambda x: x["input"][1]):
+                if key == exc["input"]:
+                    production_found = True
                 arr[count] = (
                     exc["uncertainty type"],
                     mapping[exc["input"]],
                     mapping[key],
-                    geomapping[data[key].get("location", "GLO")],
+                    geomapping[data[key].get("location", "GLO") or "GLO"],
                     MAX_INT_32,
                     MAX_INT_32,
-                    exc["technosphere"],
+                    TYPE_DICTIONARY[exc["type"]],
                     exc["amount"],
                     exc.get("sigma", np.NaN),
                     exc.get("minimum", np.NaN),
                     exc["amount"] < 1
                     )
                 count += 1
+            if not production_found and data[key]["type"] == "process":
+                # Add amount produced for each process (default 1)
+                arr[count] = (0, mapping[key], mapping[key],
+                    geomapping[data[key].get("location", "GLO") or "GLO"],
+                    MAX_INT_32, MAX_INT_32, TYPE_DICTIONARY["production"],
+                    1, np.NaN, np.NaN, np.NaN, False)
+                count += 1
 
+        # The array is too big, because it can include a default production
+        # amount for each activity. Trim to actual size.
+        arr = arr[:count]
         filepath = os.path.join(config.dir, "processed", "%s.pickle" % \
             self.database)
         with open(filepath, "wb") as f:
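
The heart of this change: each exchange becomes one row of a numpy structured array, and any process without an explicit production exchange gets a default row with amount 1. A stripped-down sketch of the same pattern, with a two-field dtype and illustrative data (the real code detects production via ``key == exc["input"]``):

    import numpy as np

    TYPE_DICTIONARY = {"production": 0, "technosphere": 1, "biosphere": 2}
    dtype = [("type", np.uint8), ("amount", np.float32)]
    data = {("db", "a"): {"type": "process", "exchanges": [
        {"type": "technosphere", "amount": 0.5}]}}

    num_exchanges = sum(len(ds["exchanges"]) for ds in data.values())
    # Reserve one extra row per dataset for a possible default production
    arr = np.zeros((num_exchanges + len(data),), dtype=dtype)
    count = 0
    for key, ds in data.items():
        production_found = False
        for exc in ds["exchanges"]:
            if exc["type"] == "production":
                production_found = True
            arr[count] = (TYPE_DICTIONARY[exc["type"]], exc["amount"])
            count += 1
        if not production_found and ds["type"] == "process":
            arr[count] = (TYPE_DICTIONARY["production"], 1)  # default amount
            count += 1
    arr = arr[:count]  # trim the unused reserved rows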

File bw2data/errors.py

     pass
 
 
-class UnknownExchange(StandardError):
-    pass
-
-
 class UnknownObject(StandardError):
     pass

File bw2data/io/__init__.py

+from bw2package import BW2PackageExporter, BW2PackageImporter, \
+    download_biosphere, download_methods
+from export_gexf import DatabaseToGEXF
 from import_ecospold import EcospoldImporter
 from import_method import EcospoldImpactAssessmentImporter
-from export_gexf import DatabaseToGEXF

File bw2data/io/bw2package.py

+# -*- coding: utf-8 -*-
+from .. import Database, databases, config, JsonWrapper, methods
+from ..utils import database_hash, download_file
+import bz2
+import os
+
+
+class BW2PackageExporter(object):
+    @classmethod
+    def export_ia_method(cls, name):
+        assert name in methods, "Can't find this IA method"
+
+    @classmethod
+    def export_database(cls, name, include_dependencies):
+        assert name in databases, "Can't find this database"
+        if include_dependencies:
+            for dependency in databases[name]["depends"]:
+                assert dependency in databases, \
+                    "Can't find dependent database %s" % dependency
+            to_export = [name] + databases[name]["depends"]
+            filename = name + ".fat.bw2package"
+        else:
+            to_export = [name]
+            filename = name + ".bw2package"
+        filepath = os.path.join(config.request_dir("export"), filename)
+        with bz2.BZ2File(filepath, "w") as f:
+            f.write(JsonWrapper.dumps({db_name: {
+                "metadata": databases[db_name],
+                "data": {k[1]: v for k, v in Database(db_name).load().iteritems()}
+                } for db_name in to_export}))
+        return filepath
+
+    @classmethod
+    def export(cls, name, include_dependencies=False):
+        if isinstance(name, (tuple, list)):
+            return cls.export_ia_method(name)
+        elif isinstance(name, basestring):
+            return cls.export_database(name, include_dependencies)
+        else:
+            raise ValueError("Unknown input data")
+
+
+class BW2PackageImporter(object):
+    @classmethod
+    def importer(cls, filepath, overwrite=False):
+        if overwrite:
+            raise NotImplementedError
+        if filepath.split(".")[-1] == "bw2package":
+            return cls.import_database(filepath, overwrite)
+        elif filepath.split(".")[-1] == "bw2iapackage":
+            return cls.import_method(filepath, overwrite)
+        else:
+            raise ValueError("Unknown input data")
+
+    @classmethod
+    def import_database(cls, filepath, overwrite):
+        if overwrite:
+            raise NotImplementedError
+        with bz2.BZ2File(filepath) as f:
+            package_data = JsonWrapper.loads(f.read())
+        for name, data in package_data.iteritems():
+            db_data = dict([((name, key), value) for key, value in \
+                data["data"].iteritems()])
+            if name in databases:
+                raise ValueError("Dataase %s already exists" % name)
+            metadata = data["metadata"]
+            database = Database(name)
+            database.register(
+                format=metadata["from format"],
+                depends=metadata["depends"],
+                num_processes=metadata["number"],
+                version=metadata["version"]
+                )
+            database.write(db_data)
+
+    @classmethod
+    def import_method(cls, filepath, overwrite):
+        if overwrite:
+            raise NotImplementedError
+        pass
+
+
+def download_biosphere():
+    filepath = download_file("biosphere.bw2package")
+    BW2PackageImporter.importer(filepath)
+
+
+def download_methods():
+    filepath = download_file("methods.bw2iapackage")
+    BW2PackageImporter.importer(filepath)
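
A hedged usage sketch of the new package format; the database name is illustrative and must already be registered locally:

    from bw2data.io import BW2PackageExporter, BW2PackageImporter

    # Export a database to a compressed .bw2package file
    filepath = BW2PackageExporter.export("biosphere",
        include_dependencies=False)

    # Later, or on another machine: import it. Raises ValueError if a
    # database with the same name already exists.
    BW2PackageImporter.importer(filepath)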

File bw2data/io/import_ecospold.py

 # -*- coding: utf-8 -*-
 from __future__ import division
 from .. import Database, mapping
-from ..logs import get_logger
-from ..errors import UnknownExchange
+from ..logs import get_io_logger
+from ..utils import activity_hash
+from ..units import normalize_units
 from lxml import objectify
 from stats_toolkit.distributions import *
+import copy
 import math
+import numpy as np
 import os
+import pprint
 import progressbar
 import warnings
 
 BIOSPHERE = ("air", "water", "soil", "resource")
 
 
-class EcospoldImporter(object):
-    """Import inventory datasets from ecospold XML format.
-
-    Does not have any arguments; instead, instantiate the class, and then import using the ``importer`` method, i.e. ``EcospoldImporter().importer(filepath)``."""
-    def importer(self, path, name, depends=["biosphere", ]):
-        """Import an inventory dataset, or a directory of inventory datasets.
-
-        Args:
-            *path* (str): A filepath or directory.
-
-        """
+class EcospoldDataExtractor(object):
+    def extract(self, path, log):
         data = []
-        log = get_logger(name)
-        log.critical(u"Starting import of %s (from %s)" % (name, path))
         if os.path.isdir(path):
             files = [os.path.join(path, y) for y in filter(
                 lambda x: x[-4:].lower() == ".xml", os.listdir(path))]
         else:
             files = [path]
-
-        widgets = ['Files: ', progressbar.Percentage(), ' ',
+        widgets = ['Extracting data: ', progressbar.Percentage(), ' ',
             progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
             progressbar.ETA()]
         pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(files)
                 continue
 
             for dataset in root.iterchildren():
-                data.append(self._process_dataset(dataset))
+                data.append(self.process_dataset(dataset))
 
             pbar.update(index)
+        pbar.finish()
+        return data
 
-        # Hackish
-        for o in data:
-            try:
-                o["code"] = int(o["code"])
-            except:
-                pass
+    def process_dataset(self, dataset):
+        ref_func = dataset.metaInformation.processInformation.\
+            referenceFunction
+        data = {
+            "name": ref_func.get("name"),
+            "type": "process",  # True for all ecospold?
+            "categories": [ref_func.get("category"), ref_func.get(
+                "subCategory")],
+            "location": dataset.metaInformation.processInformation.\
+                geography.get("location"),
+            "code": int(dataset.get("number")),
+            "unit": normalize_units(ref_func.get("unit")),
+            "exchanges": self.process_exchanges(dataset)
+            }
+        # Convert ("foo", "unspecified") to ("foo",)
+        while data["categories"] and data["categories"][-1] in (
+                "unspecified", None):
+            data["categories"] = data["categories"][:-1]
+        return data
 
-        # Fix exchanges
-        codes = set([o["code"] for o in data])
+    def process_exchanges(self, dataset):
+        data = []
+        # Skip definitional exchange - we assume this already
+        for exc in dataset.flowData.iterchildren():
+            if exc.tag == "{http://www.EcoInvent.org/EcoSpold01}exchange":
+                data.append(self.process_exchange(exc, dataset))
+            elif exc.tag == "{http://www.EcoInvent.org/EcoSpold01}allocation":
+                data.append(self.process_allocation(exc, dataset))
+            else:
+                raise ValueError("Flow data type %s not understood" % exc.tag)
+        return data
+
+    def process_allocation(self, exc, dataset):
+        return {
+            "reference": int(exc.get("referenceToCoProduct")),
+            "fraction": float(exc.get("fraction")),
+            "exchanges": [int(c.text) for c in exc.iterchildren()]
+        }
+
+    def process_exchange(self, exc, dataset):
+        # if exc.get("name") == dataset.metaInformation.processInformation.\
+        #         referenceFunction.get("name") != None and float(
+        #         exc.get("meanValue", 0.)) == 1.0:
+        #     continue
+
+        data = {
+            "code": int(exc.get("number")),
+            "matching": {
+                "categories": (exc.get("category"), exc.get("subCategory")),
+                "location": exc.get("location"),
+                "unit": normalize_units(exc.get("unit")),
+                "name": exc.get("name")
+                }
+            }
+
+        try:
+            data["group"] = int(exc.getchildren()[0].text)
+        except:
+            pass
+
+        # Convert ("foo", "unspecified") to ("foo",)
+        while data["matching"]["categories"] and \
+                data["matching"]["categories"][-1] in ("unspecified", None):
+            data["matching"]["categories"] = \
+                data["matching"]["categories"][:-1]
+
+        if exc.get("generalComment"):
+            data["comment"] = exc.get("generalComment")
+        return self.process_uncertainty_fields(exc, data)
+
+    def process_uncertainty_fields(self, exc, data):
+        uncertainty = int(exc.get("uncertaintyType", 0))
+        mean = exc.get("meanValue")
+        min_ = exc.get("minValue")
+        max_ = exc.get("maxValue")
+        sigma = exc.get("standardDeviation95")
+
+        if uncertainty == 1:
+            # Lognormal
+            data.update({
+                'uncertainty type': LognormalUncertainty.id,
+                'amount': float(mean),
+                'sigma': math.log(math.sqrt(float(sigma)))
+                })
+            if data['sigma'] == 0:
+                # Bad ecoinvent data
+                data['uncertainty type'] = UndefinedUncertainty.id
+                del data["sigma"]
+        elif uncertainty == 2:
+            # Normal
+            data.update({
+                'uncertainty type': NormalUncertainty.id,
+                'amount': float(mean),
+                'sigma': float(sigma) / 2
+                })
+        elif uncertainty == 3:
+            # Triangular
+            data.update({
+                'uncertainty type': TriangularUncertainty.id,
+                'minimum': float(min_),
+                'maximum': float(max_)
+                })
+            # Sometimes this isn't included (though it SHOULD BE)
+            if exc.get("mostLikelyValue"):
+                data['amount'] = float(exc.get("mostLikelyValue"))
+            else:
+                data['amount'] = float(mean)
+        elif uncertainty == 4:
+            # Uniform
+            data.update({
+                'uncertainty type': UniformUncertainty.id,
+                'amount': float(mean),
+                'minimum': float(min_),
+                'maximum': float(max_)
+                })
+        else:
+            # None
+            data.update({
+                'uncertainty type': UndefinedUncertainty.id,
+                'amount': float(mean)
+            })
+        return data
+
+
+class EcospoldImporter(object):
+    """Import inventory datasets from ecospold XML format.
+
+    Does not have any arguments; instead, instantiate the class, and then import using the ``importer`` method, i.e. ``EcospoldImporter().importer(filepath)``."""
+    def importer(self, path, name, depends=["biosphere"]):
+        """Import an inventory dataset, or a directory of inventory datasets.
+
+        .. image:: images/import-method.png
+            :align: center
+
+        Args:
+            *path* (str): A filepath or directory.
+
+        """
+
+        self.log, self.logfile = get_io_logger("lci-import")
+        self.new_activities = []
+        self.new_biosphere = []
+
+        data = EcospoldDataExtractor().extract(path, self.log)
+        data = self.allocate_datasets(data)
+        data = self.apply_transforms(data)
+        data = self.add_hashes(data)
+
+        widgets = ['Linking exchanges:', progressbar.Percentage(), ' ',
+            progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
+            progressbar.ETA()]
+        pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(data)
+            ).start()
+
+        linked_data = []
+        for index, ds in enumerate(data):
+            linked_data.append(self.link_exchanges(ds, data, depends, name))
+            pbar.update(index)
+        pbar.finish()
+
+        data = linked_data + self.new_activities
+
+        if self.new_biosphere:
+            self.new_biosphere = dict([((u"biosphere", o.pop("hash")), o) \
+                for o in self.new_biosphere])
+            biosphere = Database("biosphere")
+            biosphere_data = biosphere.load()
+            biosphere_data.update(self.new_biosphere)
+            biosphere.write(biosphere_data)
+            # biosphere.process()
+
+        data = self.set_exchange_types(data)
+        data = self.clean_exchanges(data)
+        # Dictionary constructor eliminates duplicate activities
+        data = dict([((name, o.pop("hash")), o) for o in data])
+        self.write_database(name, data, depends)
+
+    def allocate_datasets(self, data):
+        activities = []
+        for ds in data:
+            multi_output = [exc for exc in ds["exchanges"] \
+                if "reference" in exc]
+            if multi_output:
+                for activity in self.allocate_exchanges(ds):
+                    activities.append(activity)
+            else:
+                activities.append(ds)
+        return activities
+
+    def allocate_exchanges(self, ds):
+        """
+Take a dataset, which has multiple outputs, and return a list of allocated datasets.
+
+Two things change in the allocated datasets. First, the name changes to the names of the individual outputs. Second, the list of exchanges is rewritten, and only the allocated exchanges are used.
+
+The list of ``exchanges`` looks like:
+
+.. code-block:: python
+
+    [
+        {
+        "code": 2,
+        "matching": {
+            "categories": data,
+            "location": data,
+            "unit": data,
+            "name": name
+            },
+        "uncertainty type": data
+        "amount": reference amount,
+        "group": 2
+        }, {
+        "code": 4,
+        "matching": {data},
+        "uncertainty type": data
+        "amount": 1,
+        "group": 5
+        }, {
+        "reference": 2,
+        "fraction": 0.5,
+        "exchanges": [4,]
+        }
+    ]
+
+It should be changed to:
+
+.. code-block:: python
+
+    [
+        {
+        "code": 2,
+        "matching": {
+            "categories": data,
+            "location": data,
+            "unit": data,
+            "name": name
+            },
+        "uncertainty type": data
+        "amount": number,
+        "group": 2
+        }, {
+        "code": 4,
+        "matching": {data},
+        "uncertainty type": data
+        "amount": 0.5,
+        "group": 5
+        }
+    ]
+
+Exchanges should also be copied and allocated for any other co-products.
+
+        """
+        coproduct_codes = [exc["code"] for exc in ds["exchanges"] if exc.get(
+            "group", None) == 2]
+        coproducts = dict([(x, copy.deepcopy(ds)) for x in coproduct_codes])
+        exchanges = dict([(exc["code"], exc) for exc in ds["exchanges"
+            ] if "code" in exc])
+        allocations = [a for a in ds["exchanges"] if "fraction" in a]
+        # First, get production amounts for each coproduct.
+        # these aren't included in the allocations
+        for key, product in coproducts.iteritems():
+            product["exchanges"] = [exc for exc in product["exchanges"] if exc.get("code", None) == key]
+        # Next, correct names, location, and unit
+        for key, product in coproducts.iteritems():
+            for label in ("unit", "name", "location"):
+                if exchanges[key]["matching"].get(label, None):
+                    product[label] = exchanges[key]["matching"][label]
+        # Finally, add the allocated exchanges
+        for allocation in allocations:
+            if allocation["fraction"] == 0:
+                continue
+            product = coproducts[allocation["reference"]]
+            for exc_code in allocation["exchanges"]:
+                copied = copy.deepcopy(exchanges[exc_code])
+                copied["amount"] = copied["amount"] * allocation["fraction"]
+                product["exchanges"].append(copied)
+        return coproducts.values()
+
+    def apply_transforms(self, data):
+        # Reserved for subclasses, e.g. SimaPro import
+        # where some cleaning is necessary...
+        return data
+
+    def add_hashes(self, ds):
+        for o in ds:
+            o["hash"] = activity_hash(o)
+        return ds
+
+    def link_exchanges(self, ds, data, depends, name):
+        if self.sequential_exchanges(ds):
+            del ds["code"]
+        ds["exchanges"] = [self.link_exchange(exc, ds, data, depends, name
+            ) for exc in ds["exchanges"]]
+        return ds
+
+    def sequential_exchanges(self, ds):
+        codes = np.array([x["code"] for x in ds["exchanges"]])
+        return np.allclose(np.diff(codes), np.ones(np.diff(codes).shape))
+
+    def link_exchange(self, exc, ds, data, depends, name):
+        # Has to happen before others because US LCI doesn't define categories
+        # for product definitions...
+        if exc.get("group", None) == 0:
+            # Activity dataset production
+            exc["input"] = (name, activity_hash(ds))
+            return exc
+        # Hack for US LCI-specific bug - both "Energy recovered"
+        # and "Energy, recovered" are present
+        elif exc["matching"]["categories"] == () and \
+                exc["matching"]["name"] == "Recovered energy":
+            exc["matching"].update(
+                name="Energy, recovered",
+                categories=("resource",),
+                )
+        elif not exc["matching"]["categories"]:
+            # US LCI doesn't list categories, subcategories for
+            # technosphere inputs. Try to find based on name. Need to lowercase
+            # because US LCI is not consistent within itself (!!!)
+            for other_ds in data:
+                if other_ds["name"].lower() == \
+                        exc["matching"]["name"].lower():
+                    exc["input"] = (name, other_ds["hash"])
+                    return exc
+            # Can't find matching process - but could be a US LCI "dummy"
+            # activity
+            if exc["matching"]["name"][:5].lower() == "dummy":
+                self.log.warning(u"New activity created by:\n%s" % \
+                    pprint.pformat(exc))
+                exc["input"] = (name, self.create_activity(exc["matching"]))
+                return exc
+            else:
+                raise ValueError("Exchange can't be matched:\n%s" % \
+                    pprint.pformat(exc))
+        exc["hash"] = activity_hash(exc["matching"])
+        if exc["matching"].get("categories", [None])[0] in BIOSPHERE:
+            return self.link_biosphere(exc)
+        else:
+            return self.link_activity(exc, ds, data, depends, name)
+
+    def link_biosphere(self, exc):
+        exc["input"] = ("biosphere", exc["hash"])
+        if (u"biosphere", exc["hash"]) in Database("biosphere").load():
+            return exc
+        else:
+            new_flow = copy.deepcopy(exc["matching"])
+            new_flow.update({
+                "hash": activity_hash(exc["matching"]),
+                "type": "resource" if new_flow["categories"][0] == "resource" \
+                    else "emission",
+                "exchanges": []
+                })
+            # Biosphere flows don't have locations
+            del new_flow["location"]
+            self.new_biosphere.append(new_flow)
+            return exc
+
+    def link_activity(self, exc, ds, data, depends, name):
+        if exc["hash"] in [o["hash"] for o in data]:
+            exc["input"] = (name, exc["hash"])
+            return exc
+        else:
+            return self.link_activity_dependent_database(exc, depends, name)
+
+    def link_activity_dependent_database(self, exc, depends, name):
+        for database in depends:
+            if (database, exc["hash"]) in mapping:
+                exc["input"] = (database, exc["hash"])
+                return exc
+        # Create new activity in this database and log
+        self.log.warning(u"New activity created by:\n%s" % pprint.pformat(exc))
+        exc["input"] = (name, self.create_activity(exc["matching"]))
+        return exc
+
+    def create_activity(self, exc):
+        exc = copy.deepcopy(exc)
+        exc.update({
+            "exchanges": [],
+            "type": "process",
+            "hash": activity_hash(exc),
+            })
+        self.new_activities.append(exc)
+        return exc["hash"]
+
+    def set_exchange_types(self, data):
+        """Set the ``type`` attribute for each exchange, one of either (``production``, ``technosphere``, ``biosphere``). ``production`` defines the amount produced by the activity dataset (default is 1)."""
         for ds in data:
             for exc in ds["exchanges"]:
-                code = exc["code"]
-                # Hack - not work with others?
-                try:
-                    code = int(code)
-                except:
-                    pass
-                if code in codes:
-                    exc["input"] = (name, code)
+                if exc["input"][0] == "biosphere":
+                    exc["type"] = "biosphere"
+                elif exc["input"][1] == ds["hash"]:
+                    exc["type"] = "production"
                 else:
-                    exc["input"] = self._find_in_dependent_database(code,
-                        exc, depends)
-                exc["technosphere"] = exc["input"][0] != "biosphere"
+                    exc["type"] = "technosphere"
+        return data
 
-            # Sort for consistent order to make import comparisons easier
-            ds["exchanges"].sort(key=lambda x: x["input"])
+    def clean_exchanges(self, data):
+        for ds in data:
+            for exc in ds["exchanges"]:
+                if "matching" in exc:
+                    del exc["matching"]
+                if "hash" in exc:
+                    del exc["hash"]
+        return data
 
-        data = dict([((name, int(o["code"])), o) for o in data])
-
+    def write_database(self, name, data, depends):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             manager = Database(name)
             manager.register(("Ecospold", 1), depends, len(data))
             manager.write(data)
-            manager.process()
-
-        pbar.finish()
-
-    def _find_in_dependent_database(self, code, exc, depends):
-        for db in depends:
-            if (db, code) in mapping:
-                return (db, code)
-
-        # Add new biosphere flow if needed
-        if exc["_matching"].get("categories", [None, ])[0] in BIOSPHERE:
-            data = exc["_matching"]
-
-            # Emission or resource
-            resource = data["categories"][0] == "resource"
-            data["type"] = "resource" if resource else "emission"
-
-            # Biosphere flows don't have locations or exchanges
-            del data["location"]
-            data["exchanges"] = []
-
-            # Write modified biosphere database
-            biosphere = Database("biosphere")
-            bio_data = biosphere.load()
-            bio_data[("biosphere", code)] = data
-            biosphere.write(bio_data)
-            return ("biosphere", code)
-        raise UnknownExchange(("The exchange %s couldn't be " + \
-            "matched to this or a depending database") % code)
-
-    def _process_dataset(self, dataset):
-        data = {}
-        ref_func = dataset.metaInformation.processInformation.\
-            referenceFunction
-
-        data["name"] = ref_func.get("name")
-        data["type"] = "process"  # True for all ecospold?
-        data["categories"] = [ref_func.get("category"), ref_func.get(
-            "subCategory")]
-        # Convert ("foo", "unspecified") to ("foo",)
-        while data["categories"][-1] == "unspecified":
-            data["categories"] = data["categories"][:-1]
-        data["location"] = dataset.metaInformation.processInformation.\
-            geography.get("location")
-        data["code"] = dataset.get("number")
-        data["unit"] = ref_func.get("unit")
-        data["exchanges"] = self._process_exchanges(dataset)
-        return data
-
-    def _process_exchanges(self, dataset):
-        data = []
-        # Skip definitional exchange - we assume this already
-        for exc in dataset.flowData.iterchildren():
-            if exc.get("name") == dataset.metaInformation.processInformation.\
-                    referenceFunction.get("name") != None and float(
-                    exc.get("meanValue", 0.)) == 1.0:
-                continue
-
-            this = {
-                "code": int(exc.get("number")),
-                "_matching": {
-                    "categories": (exc.get("category"), exc.get("subCategory")),
-                    "location": exc.get("location"),
-                    "unit": exc.get("unit"),
-                    "name": exc.get("name")
-                    }
-                }
-
-            if exc.get("generalComment"):
-                this["pedigree matrix"] = exc.get("generalComment")
-
-            uncertainty = int(exc.get("uncertaintyType", 0))
-            mean = exc.get("meanValue")
-            min_ = exc.get("minValue")
-            max_ = exc.get("maxValue")
-            sigma = exc.get("standardDeviation95")
-
-            if uncertainty == 1:
-                # Lognormal
-                this.update({
-                    'uncertainty type': LognormalUncertainty.id,
-                    'amount': float(mean),
-                    'sigma': math.log(math.sqrt(float(sigma)))
-                    })
-                if this['sigma'] == 0:
-                    # Bad ecoinvent data
-                    this['uncertainty type'] = UndefinedUncertainty.id
-                    del this["sigma"]
-            elif uncertainty == 2:
-                # Normal
-                this.update({
-                    'uncertainty type': NormalUncertainty.id,
-                    'amount': float(mean),
-                    'sigma': float(sigma) / 2
-                    })
-            elif uncertainty == 3:
-                # Triangular
-                this.update({
-                    'uncertainty type': TriangularUncertainty.id,
-                    'minimum': float(min_),
-                    'maximum': float(max_)
-                    })
-                # Sometimes this isn't included (though it SHOULD BE)
-                if exc.get("mostLikelyValue"):
-                    this['amount'] = float(exc.get("mostLikelyValue"))
-                else:
-                    this['amount'] = float(mean)
-            elif uncertainty == 4:
-                # Uniform
-                this.update({
-                    'uncertainty type': UniformUncertainty.id,
-                    'amount': float(mean),
-                    'minimum': float(min_),
-                    'maximum': float(max_)
-                    })
-            else:
-                # None
-                this.update({
-                    'uncertainty type': UndefinedUncertainty.id,
-                    'amount': float(mean)
-                })
-
-            data.append(this)
-
-        return data
+            # manager.process()
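
Putting the refactored importer together, a sketch of a typical call (path and database name are illustrative):

    from bw2data.io import EcospoldImporter

    # Import a directory of ecospold XML files into a new database.
    # Unmatched technosphere exchanges are linked against `depends`.
    EcospoldImporter().importer(
        path="/path/to/ecospold/files",
        name="example database",
        depends=["biosphere"],
    )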

File bw2data/io/import_method.py

 # -*- coding: utf-8 -*-
-from .. import Database, Method, methods
+from .. import Database, Method, methods, mapping
 from ..logs import get_io_logger
+from ..units import normalize_units
+from ..utils import activity_hash
 from lxml import objectify
-import numpy as np
 import os
 import pprint
 import progressbar
         else:
             files = [path]
 
-        self.new_flows = False
         self.log, self.logfile = get_io_logger("lcia-import")
 
         try:
             # Biosphere not loaded
             raise ValueError("Can't find biosphere database; check configuration.")
 
-        self.max_code = max(50000, max([x[1] for x in self.biosphere_data]))
-
         if progressbar:
             widgets = ['Files: ', progressbar.Percentage(), ' ',
                 progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
         pbar.finish()
 
     def add_method(self, ds):
+        self.new_flows = []
         ref_func = ds.metaInformation.processInformation.referenceFunction
         name = (ref_func.get("category"), ref_func.get("subCategory"),
             ref_func.get("name"))
-        assert name not in methods
+        assert name not in methods, "%s already imported" % str(name)
         description = ref_func.get("generalComment") or ""
         unit = ref_func.get("unit") or ""
-
-        # Check if codes are sequential
-        codes = np.array([int(cf.get("number")) for cf in ds.flowData.iterchildren()])
-        sequential = np.allclose(np.diff(codes), np.ones(np.diff(codes).shape))
-
-        if sequential:
-            data = self.add_sequential_cfs(ds)
-        else:
-            data = self.add_nonsequential_cfs(ds)
+        data = [self.add_cf(o) for o in ds.flowData.iterchildren()]
 
         if self.new_flows:
             biosphere = Database("biosphere")
-            biosphere.write(self.biosphere_data)
+            biosphere_data = biosphere.load()
+            # Could be considered dirty to .pop() inside list comprehension
+            # but it works. The dictionary constructor also eliminates
+            # duplicates.
+            biosphere_data.update(dict([(("biosphere", o.pop("hash")), o
+                ) for o in self.new_flows]))
+            biosphere.write(biosphere_data)
             biosphere.process()
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             method = Method(name)
             method.register(unit, description, len(data))
-            method.write(data)
+            method.write(dict([(("biosphere", o[0]), o[1]) for o in data]))
             method.process()
 
-    def add_sequential_cfs(self, ds):
-        data = {}
-        for cf in ds.flowData.iterchildren():
-            cf_data = self.get_cf_data(cf, ignore_code=True)
-            cf_data = self.match_biosphere_by_attrs(cf_data)
-            data[("biosphere", cf_data["code"])] = float(cf.get("meanValue"))
-        return data
+    def add_cf(self, cf):
+        data = self.get_cf_data(cf)
+        if ("biosphere", data["hash"]) not in mapping:
+            self.add_flow(data)
+        return (data["hash"], float(cf.get("meanValue")))
 
-    def add_nonsequential_cfs(self, ds):
-        data = {}
-        for cf in ds.flowData.iterchildren():
-            cf_data = self.get_cf_data(cf)
-            cf_data = self.code_in_biosphere(cf_data)
-            data[("biosphere", cf_data["code"])] = float(cf.get("meanValue"))
-        return data
-
-    def match_biosphere_by_attrs(self, cf):
-        found = False
-        for key, value in self.biosphere_data.iteritems():
-            if self.verify_attrs(cf, value):
-                found = key[1]
-                break
-        if found:
-            cf["code"] = found
-            return cf
-        else:
-            cf["code"] = self.get_new_code()
-            self.log_info(cf)
-            self.add_flow(cf)
-            return cf
-
-    def get_new_code(self):
-        self.max_code += 1
-        return self.max_code
-
-    def verify_attrs(self, cf, bio):
-        return bio["name"].lower() == cf["name"].lower() and \
-            list(bio["categories"]) == cf["categories"]
-
-    def code_in_biosphere(self, cf):
-        key = ("biosphere", cf["code"])
-        if key in self.biosphere_data:
-            if self.verify_attrs(cf, self.biosphere_data[key]):
-                return cf
-            else:
-                self.log_warning(cf)
-        return self.match_biosphere_by_attrs(cf)
-
-    def log_warning(self, cf):
-        error_message = "Found biosphere flow with same code but conflicting attributes:\n"
-        error_message += "\tExisting version:\n"
-        error_message += pprint.pformat(self.biosphere_data[("biosphere", cf["code"])])
-        error_message += "\tNew version:\n"
-        error_message += pprint.pformat(cf)
-        self.log.warning(error_message)
-
-    def log_info(self, cf):
-        log_message = "Adding new biosphere flow:\n"
-        log_message += pprint.pformat(cf)
-        self.log.info(log_message)
-
-    def get_int_code(self, cf):
-        try:
-            return int(cf.get("number"))
-        except:
-            raise ValueError("Can't convert `number` attribute to number")
-
-    def get_cf_data(self, cf, ignore_code=False):
+    def get_cf_data(self, cf):
         data = {
             "name": cf.get("name"),
             "categories": [cf.get("category"),
                 cf.get("subCategory") or "unspecified"],
-            "unit": cf.get("unit"),
-            "exchanges": []
-        }
+            "unit": normalize_units(cf.get("unit")),
+            }
         # Convert ("foo", "unspecified") to ("foo",)
         while data["categories"][-1] == "unspecified":
             data["categories"] = data["categories"][:-1]
-        if not ignore_code:
-            data["code"] = self.get_int_code(cf)
+        data["hash"] = activity_hash(data)
         return data
 
     def add_flow(self, cf):
-        """Add new biosphere flow"""
-        # Emission or resource
-        resource = cf["categories"][0] == "resource"
-        cf["type"] = "resource" if resource else "emission"
-        self.new_flows = True
-        self.biosphere_data[("biosphere", cf["code"])] = cf
+        self.log.warning("Adding new biosphere flow:\n%s" % pprint.pformat(cf))
+        cf.update({
+            "exchanges": [],
+            "type": "resource" if cf["categories"][0] == "resource" \
+                else "emission"
+            })
+        self.new_flows.append(cf)
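
Assuming the class keeps the single-path ``importer`` entry point used elsewhere in this module (the full signature lies outside this hunk), usage would look roughly like:

    from bw2data.io import EcospoldImpactAssessmentImporter

    # Illustrative path; new biosphere flows found while matching
    # characterization factors are added to the biosphere database.
    EcospoldImpactAssessmentImporter().importer("/path/to/method.xml")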

File bw2data/tests/base.py

 class BW2DataTest(unittest.TestCase):
     def setUp(self):
         config.dont_warn = True
+        config.cache = {}
         config.dir = tempfile.mkdtemp()
         config.create_basic_directories()
         reset_meta()

File bw2data/tests/fixtures.py

         'code': 1,
         'exchanges': [],
         'name': 'an emission',
-        'type': 'process',
+        'type': 'emission',
         'unit': 'kg'
         },
     ("biosphere", 2): {
         'categories': ['things'],
         'code': 2,
         'exchanges': [],
-        'type': 'another emission',
+        'type': 'emission',
+        'name': 'another emission',
         'unit': 'kg'
         },
 }
         'exchanges': [{
             'amount': 0.5,
             'input': ('food', 2),
-            'technosphere': True,
+            'type': 'technosphere',
             'uncertainty type': 0},
             {'amount': 0.05,
             'input': ('biosphere', 1),
-            'technosphere': False,
+            'type': 'biosphere',
             'uncertainty type': 0}],
         'location': 'CA',
         'name': 'lunch',
         'exchanges': [{
             'amount': 0.25,
             'input': ('food', 1),
-            'technosphere': True,
+            'type': 'technosphere',
             'uncertainty type': 0},
             {'amount': 0.15,
             'input': ('biosphere', 2),
-            'technosphere': False,
+            'type': 'biosphere',
             'uncertainty type': 0}],
         'location': 'CH',
         'name': 'dinner',

File bw2data/tests/geo.py

         pickled = pickle.load(open(os.path.join(config.dir, "processed",
             database.database + ".pickle"), "rb"))
         self.assertTrue(np.allclose(pickled["geo"],
-            geomapping["GLO"] * np.ones((4,))))
+            geomapping["GLO"] * np.ones(pickled.shape)))
 
     def test_method_write_adds_to_geomapping(self):
         self.add_method(False)

File bw2data/units.py

+UNITS_NORMALIZATION = {
+    "mj": u"megajoule",
+    "kg": u"kilogram",
+    "m3": u"cubic meter",
+    "m2": u"square meter",
+    'm3a': u"cubic meter-year",
+    "m2a": u"square meter-year",
+}
+
+normalize_units = lambda x: UNITS_NORMALIZATION.get(x.lower(), x)
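
Behavior of the new helper, following directly from the table above:

    from bw2data.units import normalize_units

    assert normalize_units("MJ") == u"megajoule"  # lookup is case-insensitive
    assert normalize_units("bushel") == "bushel"  # unknown units pass through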

File bw2data/utils.py

 # -*- coding: utf-8 -*-
+from . import config
+import hashlib
+import os
 import random
 import re
+import requests
 import string
 
 # Maximum value for unsigned integer stored in 4 bytes
 MAX_INT_32 = 4294967295
 
+TYPE_DICTIONARY = {
+    "production": 0,
+    "technosphere": 1,
+    "biosphere": 2,
+    }
+
+DOWNLOAD_URL = "http://secret.brightwaylca.org/data/"
+
 
 def natural_sort(l):
     """Sort the given list in the way that humans expect"""
     return sorted(l, key=alphanum_key)
 
 
+def recursively_sort(obj):
+    if isinstance(obj, dict):
+        return sorted([(k, recursively_sort(v)) for k, v in obj.iteritems()])
+    elif hasattr(obj, "__iter__"):
+        return sorted((recursively_sort(x) for x in obj))
+    else:
+        return obj
+
+
 def random_string(length):
     return ''.join(random.choice(string.letters + string.digits
         ) for i in xrange(length))
     method.register(**meta)
     method.write(data)
     method.process()
+
+
+def database_hash(data):
+    return hashlib.md5(unicode(recursively_sort(data)).encode("utf-8")).hexdigest()
+
+
+def activity_hash(data):
+    string = (data["name"].lower() + \
+        u"".join(data["categories"]) + \
+        (data.get("unit", u"") or u"").lower() + \
+        (data.get("location", u"") or u"").lower())
+    return unicode(hashlib.md5(string.encode('utf-8')).hexdigest())
+
+
+def download_file(filename):
+    dirpath = config.request_dir("downloads")
+    filepath = os.path.join(dirpath, filename)
+    download = requests.get(DOWNLOAD_URL + filename, prefetch=False).raw
+    chunk = 128 * 1024
+    with open(filepath, "wb") as f:
+        while True:
+            segment = download.read(chunk)
+            if not segment:
+                break
+            f.write(segment)
+    return filepath
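
``activity_hash`` derives a stable key from an activity's descriptive attributes, so repeated imports of the same flow resolve to the same ``(database, hash)`` key. A sketch with illustrative field values:

    from bw2data.utils import activity_hash

    flow = {"name": "Carbon dioxide", "categories": ["air"],
        "unit": "kilogram"}
    key = (u"biosphere", activity_hash(flow))
    # Hashing is deterministic: equal attributes give equal keys
    assert activity_hash(flow) == activity_hash(dict(flow))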

File bw2data/validate.py

     return o
 
 db_validator = Schema({valid_tuple: {
-    required("code"): object,
+    "code": object,
     "categories": list or tuple,
     "location": object,
     required("name"): basestring,
-    required("type"): "process" or "emission" or "resource",
-    "unit": basestring,
+    required("type"): basestring,
+    required("unit"): basestring,
     required("exchanges"): [{
         required("amount"): float,
-        required("input"): object,
-        "pedigree matrix": basestring,
+        required("input"): valid_tuple,
+        "comment": basestring,
         "code": object,
         "sigma": float,
-        required("technosphere"): bool,
-        required("uncertainty type"): int
+        required("uncertainty type"): int,
+        required("type"): basestring,
         }]
     }},
     extra=True)
 
-ia_validator = Schema({valid_tuple: float})
+ia_validator = Schema({valid_tuple: float or (object, float)})
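
For orientation, a dataset shaped for the updated schema (all values illustrative): ``type`` is now required on both datasets and exchanges, ``unit`` is required, and ``input`` must be a valid key tuple:

    dataset = {("food", 1): {
        "name": "lunch",
        "type": "process",
        "unit": "kilogram",
        "location": "CH",
        "categories": ["stuff", "meals"],
        "exchanges": [{
            "input": ("biosphere", 2),
            "amount": 0.05,
            "type": "biosphere",
            "uncertainty type": 0,
        }],
    }}
    # db_validator(dataset) returns the data unchanged if it validates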

File docs/conf.py

 
 # General information about the project.
 project = u'bw2data'
-copyright = u'2012, Chris Mutel'
+copyright = u'2013, Chris Mutel'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.8'
+version = '0.9'
 # The full version, including alpha/beta/rc tags.
-release = '0.8'
+release = '0.9.0-alpha'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

File docs/images/import-database.png

Added (binary image)

File docs/images/import-method.png

Replaced (binary image)

File docs/index.rst

    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to bw2data's documentation!
-===================================
+Brightway2-data
+===============
 
-Contents:
+This is the technical documentation for Brightway2-data, part of the `Brightway2 <http://brightwaylca.org>`_ life cycle assessment calculation framework. The following online resources are available:
+
+* `Source code <https://bitbucket.org/cmutel/brightway2-data>`_
+* `Documentation on Read the Docs <http://bw2data.readthedocs.org>`_
+* `Test coverage <http://coverage.brightwaylca.org/data/index.html>`_
+
+Running tests
+-------------
+
+To run the tests, install `nose <https://nose.readthedocs.org/en/latest/>`_, and run ``nosetests``.
+
+Building the documentation
+--------------------------
+
+Install `sphinx <http://sphinx.pocoo.org/>`_, and then change to the ``docs`` directory, and run ``make html`` (or ``make.bat html`` in Windows).
+
+Table of Contents
+-----------------
 
 .. toctree::
    :maxdepth: 2
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
-

File docs/technical.rst

 Modular structure
 =================
 
-Brightway2 is a framework for life cycle assessment, and consists of several packages. You only need to install or understand the components that are of interest to you. Splitting components allow for a clean separation of concerns, as each package has a limited focus, and makes testing and documenting each package easier and cleaner. 
+Brightway2 is a framework for life cycle assessment, and consists of several packages. You only need to install or understand the components that are of interest to you. Splitting components allows for a clean separation of concerns, as each package has a limited focus, and makes testing and documenting each package easier and cleaner.
 
 This guide has technical details for the ``bw2data`` package. Each separate package also has its own documentation.
 
 * bw2calc: The LCA calculators. Normal LCA, several varieties of Monte Carlo LCA (including parallel Monte Carlo using all the cores on your computer), Latin Hypercubic sampling, and graph traversal.
 * bw2analyzer: Functions for analyzing the results of LCA calculations, including contribution and sensitivity analysis.
 * bw2ui: Two different user interfaces for Brightway2. Brightway2 is pure Python, and can be used by other programs or in an ipython notebook. For people who aren't as comfortable programming in Python, this packages provides a command line interface for work in the terminal, and a web interface.
-* bw2speedups: A few small utilities that can make calculations faster. Requires Cython.
 
 Configuration
 =============
 
-The configuration for brightway2 (which is currently only the location of the data directory) is implemented as a singleton class that is created when ``brightway2`` is imported.
+The configuration for brightway2 is implemented as a singleton class that is created when ``brightway2`` is imported.
 
 .. autoclass:: bw2data._config.Config
     :members:
     :members:
     :inherited-members:
 
-
-Documentation
-=============
-
-Documentation is uses `Sphinx <http://sphinx.pocoo.org/>`_ to build the source files into other forms. Building is as simple as changing to the docs directory, and running ``make.bat html`` in Windows and ``make html`` in OS X and Linux.
-
 Database
 ========
 
 Import and Export
 =================
 
+.. autoclass:: bw2data.io.BW2PackageImporter
+    :members:
+
+.. autoclass:: bw2data.io.BW2PackageExporter
+    :members:
+
 .. autoclass:: bw2data.io.EcospoldImporter
     :members:
 
 
 setup(
   name='bw2data',
-  version="0.8.4.2",
+  version="0.9.0-alpha",
   packages=packages,
   author="Chris Mutel",
   author_email="cmutel@gmail.com",
   license=open('LICENSE.txt').read(),
-  install_requires=["voluptuous", "progressbar", "numpy", "lxml", "scipy", "brightway2"],
+  install_requires=["voluptuous", "progressbar", "numpy", "lxml", "scipy", "requests", "brightway2"],
   url="https://bitbucket.org/cmutel/brightway2-data",
   long_description=open('README.rst').read(),
   classifiers=[