Commits

Chris Mutel committed 00b42cf

0.8.3: Rewritten IA import script is much slower, but correctly handles sequential (i.e. false) id numbers, conflicts, etc., and keeps logs on activity

  • Participants
  • Parent commits 8de885c

Comments (0)

Files changed (3)

bw2data/io/import_method.py

 # -*- coding: utf-8 -*
-from .. import Database, mapping, Method, methods, databases
+from .. import Database, Method, methods
+from ..logs import get_io_logger
 from lxml import objectify
-import hashlib
+import numpy as np
 import os
+import pprint
 import progressbar
 import warnings
 try:
     def importer(self, path):
         """Import an impact assessment method, or a directory of impact assessment methods.
 
+        The flow logic is relatively complex, because:
+            #. We have to make sure the ``number`` attribute is not just a sequential list
+            #. Even if valid biosphere ``number``s are provided, we can't believe them.
+
+        Here is the flow logic graphic:
+
+        .. image:: images/import-method.png
+            :align: center
+
         Args:
             *path* (str): A filepath or directory.
 
         else:
             files = [path]
 
-        self.new_flows = {}
+        self.new_flows = False
+        self.log, self.logfile = get_io_logger("lcia-import")
 
         try:
             self.biosphere_data = Database("biosphere").load()
         except:
             # Biosphere not loaded
             raise ValueError("Can't find biosphere database; check configuration.")
+
+        self.max_code = max(50000, max([x[1] for x in self.biosphere_data]))
+
         if progressbar:
             widgets = ['Files: ', progressbar.Percentage(), ' ',
                 progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
         ref_func = ds.metaInformation.processInformation.referenceFunction
         name = (ref_func.get("category"), ref_func.get("subCategory"),
             ref_func.get("name"))
+        assert name not in methods
         description = ref_func.get("generalComment") or ""
         unit = ref_func.get("unit") or ""
-        data = {}
-        for cf in ds.flowData.iterchildren():
-            code = self.get_code(cf)
-            if ("biosphere", code) not in mapping:
-                self.add_flow(cf, code)
-            data[("biosphere", code)] = float(cf.get("meanValue"))
-        assert name not in methods
+
+        # Check if codes are sequential
+        codes = np.array([int(cf.get("number")) for cf in ds.flowData.iterchildren()])
+        sequential = np.allclose(np.diff(codes), np.ones(np.diff(codes).shape))
+
+        if sequential:
+            data = self.add_sequential_cfs(ds)
+        else:
+            data = self.add_nonsequential_cfs(ds)
 
         if self.new_flows:
             biosphere = Database("biosphere")
-            bio_data = biosphere.load()
-            bio_data.update(self.new_flows)
-            biosphere.write(bio_data)
+            biosphere.write(self.biosphere_data)
             biosphere.process()
 
         with warnings.catch_warnings():
             method.write(data)
             method.process()
 
-    def get_code(self, cf):
+    def add_sequential_cfs(self, ds):
+        data = {}
+        for cf in ds.flowData.iterchildren():
+            cf_data = self.get_cf_data(cf, ignore_code=True)
+            cf_data = self.match_biosphere_by_attrs(cf_data)
+            data[("biosphere", cf_data["code"])] = float(cf.get("meanValue"))
+        return data
+
+    def add_nonsequential_cfs(self, ds):
+        data = {}
+        for cf in ds.flowData.iterchildren():
+            cf_data = self.get_cf_data(cf)
+            cf_data = self.code_in_biosphere(cf_data)
+            data[("biosphere", cf_data["code"])] = float(cf.get("meanValue"))
+        return data
+
+    def match_biosphere_by_attrs(self, cf):
+        found = False
+        for key, value in self.biosphere_data.iteritems():
+            if self.verify_attrs(cf, value):
+                found = key[1]
+                break
+        if found:
+            cf["code"] = found
+            return cf
+        else:
+            cf["code"] = self.get_new_code()
+            self.log_info(cf)
+            self.add_flow(cf)
+            return cf
+
+    def get_new_code(self):
+        self.max_code += 1
+        return self.max_code
+
+    def verify_attrs(self, cf, bio):
+        return bio["name"].lower() == cf["name"].lower() and \
+            list(bio["categories"]) == cf["categories"]
+
+    def code_in_biosphere(self, cf):
+        key = ("biosphere", cf["code"])
+        if key in self.biosphere_data:
+            if self.verify_attrs(cf, self.biosphere_data[key]):
+                return cf
+            else:
+                self.log_warning(cf)
+        return self.match_biosphere_by_attrs(cf)
+
+    def log_warning(self, cf):
+        error_message = "Found biosphere flow with same code but conflicting attributes:\n"
+        error_message += "\tExisting version:\n"
+        error_message += pprint.pformat(self.biosphere_data[("biosphere", cf["code"])])
+        error_message += "\tNew version:\n"
+        error_message += pprint.pformat(cf)
+        self.log.warning(error_message)
+
+    def log_info(self, cf):
+        log_message = "Adding new biosphere flow:\n"
+        log_message += pprint.pformat(cf)
+        self.log.info(log_message)
+
+    def get_int_code(self, cf):
         try:
-            int_code = int(cf.get("number"))
-            assert int_code
-            return int_code
+            return int(cf.get("number"))
         except:
-            hasher = hashlib.md5()
-            cat_string = "-".join([cf.get("category"), cf.get("subCategory")])
-            hasher.update(cat_string)
-            return hasher.hexdigest()[-8:]
+            raise ValueError("Can't convert `number` attribute to number")
 
-    def add_flow(self, cf, code):
-        """Add new biosphere flow"""
-        new_flow = {
+    def get_cf_data(self, cf, ignore_code=False):
+        data = {
             "name": cf.get("name"),
-            "categories": (cf.get("category"),
-                cf.get("subCategory") or "unspecified"),
-            "code": code,
+            "categories": [cf.get("category"),
+                cf.get("subCategory") or "unspecified"],
             "unit": cf.get("unit"),
             "exchanges": []
         }
+        # Convert ("foo", "unspecified") to ("foo",)
+        while data["categories"][-1] == "unspecified":
+            data["categories"] = data["categories"][:-1]
+        if not ignore_code:
+            data["code"] = self.get_int_code(cf)
+        return data
 
-        # Convert ("foo", "unspecified") to ("foo",)
-        while new_flow["categories"][-1] == "unspecified":
-            new_flow["categories"] = new_flow["categories"][:-1]
-
+    def add_flow(self, cf):
+        """Add new biosphere flow"""
         # Emission or resource
-        resource = new_flow["categories"][0] == "resource"
-        new_flow["type"] = "resource" if resource else "emission"
-        self.new_flows[("biosphere", code)] = new_flow
+        resource = cf["categories"][0] == "resource"
+        cf["type"] = "resource" if resource else "emission"
+        self.new_flows = True
+        self.biosphere_data[("biosphere", cf["code"])] = cf

docs/images/import-method.png

Added
New image
 
 setup(
   name='bw2data',
-  version="0.8.2",
+  version="0.8.3",
   packages=packages,
   author="Chris Mutel",
   author_email="cmutel@gmail.com",