Chris Mutel committed 4c41e1e

Initial version of stats_arrays functionality

Comments (0)

Files changed (3)

bw2data/database.py

 
     def process(self, version=None):
         """
-Process intermediate data from a Python dictionary to a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type. These structured arrays act as a standard data format for LCA and Monte Carlo calculations, and are the native data format for the Stats Arrays package.
+Process intermediate data from a Python dictionary to a `stats_arrays <https://bitbucket.org/cmutel/stats_arrays/>`_ array, which is a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type.
 
 Processed arrays are saved in the ``processed`` directory.
 
-The structure for processed inventory databases is:
+The structure for processed inventory databases includes additional columns beyond the basic ``stats_arrays`` format:
 
================ ======== ===================================
Column name      Type     Description
================ ======== ===================================
row              uint32   filled with the ``MAX_INT_32`` placeholder; resolved during matrix construction
col              uint32   filled with the ``MAX_INT_32`` placeholder; resolved during matrix construction
type             uint8    integer type defined in ``bw2data.utils.TYPE_DICTIONARY``
-amount           float32  location parameter, e.g. mean
-sigma            float32  shape parameter, e.g. std
+amount           float32  amount without uncertainty
+loc              float32  location parameter, e.g. mean
+scale            float32  scale parameter, e.g. standard deviation
+shape            float32  shape parameter
minimum          float32  minimum bound
maximum          float32  maximum bound
negative         bool     ``amount`` < 0
================ ======== ===================================
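
A hedged aside, not part of this commit: the sketch below shows how a structured array with these columns behaves. The dtype simply mirrors the table above; the values are invented for illustration.

    import numpy as np

    # Columns mirror the processed-array layout documented above
    dtype = [
        ('row', np.uint32),
        ('col', np.uint32),
        ('type', np.uint8),
        ('amount', np.float32),
        ('loc', np.float32),
        ('scale', np.float32),
        ('shape', np.float32),
        ('minimum', np.float32),
        ('maximum', np.float32),
        ('negative', np.bool_),
    ]
    arr = np.zeros((3,), dtype=dtype)
    arr['loc'] = arr['scale'] = arr['shape'] = np.nan  # only float columns can hold NaN
    arr['row'] = arr['col'] = 4294967295               # MAX_INT_32 placeholder; uint32 cannot store NaN
    print(arr['amount'])                               # each column is addressed by its label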
             ('col', np.uint32),
             ('type', np.uint8),
             ('amount', np.float32),
-            ('sigma', np.float32),
+            ('loc', np.float32),
+            ('scale', np.float32),
+            ('shape', np.float32),
             ('minimum', np.float32),
             ('maximum', np.float32),
             ('negative', np.bool)
         ]
         arr = np.zeros((num_exchanges + len(data), ), dtype=dtype)
-        arr['minimum'] = arr['maximum'] = arr['sigma'] = np.NaN
+        arr['minimum'] = arr['maximum'] = np.NaN
+        arr['loc'] = arr['scale'] = arr['shape'] = np.NaN
         count = 0
         for key in sorted(data.keys(), key=lambda x: x[1]):
             production_found = False
                     MAX_INT_32,
                     TYPE_DICTIONARY[exc["type"]],
                     exc["amount"],
-                    exc.get("sigma", np.NaN),
+                    exc.get("loc", np.NaN),
+                    exc.get("scale", np.NaN),
+                    exc.get("shape", np.NaN),
                     exc.get("minimum", np.NaN),
                     exc.get("maximum", np.NaN),
                     exc["amount"] < 0

bw2data/io/import_ecospold.py

             data.update({
                 'uncertainty type': LognormalUncertainty.id,
                 'amount': float(mean),
-                'sigma': math.log(math.sqrt(float(sigma)))
-                })
+                'loc': np.log(np.abs(float(mean))),
+                'scale': math.log(math.sqrt(float(sigma))),
+                'negative': float(mean) < 0,
+            })
-            if data['sigma'] == 0:
+            if data['scale'] == 0:
                 # Bad ecoinvent data
                 data['uncertainty type'] = UndefinedUncertainty.id
+                data['loc'] = data['amount']
                 del data["sigma"]
         elif uncertainty == 2:
             # Normal
             data.update({
                 'uncertainty type': NormalUncertainty.id,
                 'amount': float(mean),
-                'sigma': float(sigma) / 2
-                })
+                'loc': float(mean),
+                'scale': float(sigma) / 2
+            })
         elif uncertainty == 3:
             # Triangular
             data.update({
                 'uncertainty type': TriangularUncertainty.id,
                 'minimum': float(min_),
                 'maximum': float(max_)
-                })
+            })
             # Sometimes this isn't included (though it SHOULD BE)
             if exc.get("mostLikelyValue"):
-                data['amount'] = float(exc.get("mostLikelyValue"))
+                mode = float(exc.get("mostLikelyValue"))
+                data['amount'] = data['loc'] = mode
             else:
-                data['amount'] = float(mean)
+                data['amount'] = data['loc'] = float(mean)
         elif uncertainty == 4:
             # Uniform
             data.update({
             # None
             data.update({
                 'uncertainty type': UndefinedUncertainty.id,
-                'amount': float(mean)
+                'amount': float(mean),
+                'loc': float(mean),
             })
         return data
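
A hedged note on the lognormal branch above: ecospold 1 reports ``standardDeviation95``, which is the squared geometric standard deviation, so ``scale`` works out to ``ln(sqrt(sigma)) == ln(GSD)``, while ``loc`` is the natural log of the absolute median. A minimal sketch, with a hypothetical helper name not found in bw2data:

    import math

    def lognormal_params(mean, sigma):
        # sigma is assumed to be the squared geometric standard deviation
        return {
            'loc': math.log(abs(float(mean))),           # ln of the median
            'scale': math.log(math.sqrt(float(sigma))),  # ln(GSD)
            'negative': float(mean) < 0,
        }

    # Median 2.5 with GSD 1.1, so sigma = 1.1 ** 2 = 1.21:
    print(lognormal_params(2.5, 1.21))  # 'scale' is ln(1.1), roughly 0.0953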
 
bw2data/utils.py

from . import config, reset_meta
 import codecs
 import hashlib
+import numpy as np
 import os
 import random
 import re
     import cStringIO as StringIO
 except ImportError:
     import StringIO
+try:
+    import stats_arrays as sa
+except ImportError:
+    import warnings
+    WARNING_TEXT = """
+
+It looks like you need to upgrade to the ``stats_arrays`` package. This is a new statistical toolkit that replaces the deprecated ``bw_stats_toolkit``. Read more at https://bitbucket.org/cmutel/stats_arrays/.
+
+First, install the ``stats_arrays`` package, something like this:
+
+    pip install stats_arrays
+
+Then enter a Python interpreter, and run the following:
+
+    from bw2data.utils import convert_from_stats_toolkit
+    convert_from_stats_toolkit()
+
+    """
+    warnings.warn(WARNING_TEXT)
+    sa = None
 
 # Maximum value for unsigned integer stored in 4 bytes
 MAX_INT_32 = 4294967295
     zf.close()
     memory_obj.seek(0)
     return memory_obj
+
+
+def convert_from_stats_toolkit():
+    """Convert all databases from ``bw_stats_toolkit`` to ``stats_arrays`` (https://bitbucket.org/cmutel/stats_arrays/)."""
+    def update_exchange(exc):
+        if not exc.get('uncertainty type', None):
+            return exc
+        exc['scale'] = exc.pop('sigma')
+        exc['loc'] = exc['amount']
+        if exc['uncertainty type'] == sa.LognormalUncertainty.id:
+            exc['negative'] = exc['amount'] < 0
+            exc['loc'] = np.log(np.abs(exc['amount']))
+        return exc
+
+    assert sa, "Must have `stats_arrays` package for this function"
+    from bw2data import Database, databases
+    print "Starting conversion"
+    for database in databases:
+        print "Working on %s" % database
+        db = Database(database)
+        data = db.load()
+        for key, value in data.items():
+            if 'exchanges' in value:
+                value['exchanges'] = [update_exchange(exchange)
+                                      for exchange in value['exchanges']]
+            data[key] = value
+        db.write(data)
+        db.process()
+    print "Conversion finished"