Commits

Chris Mutel committed 1a548de Merge

Merged functionality to use stats_arrays instead of bw-stats-toolkit

  • Parent commits 02b9bd8, 55f71d1

Files changed (8)

File bw2data/database.py

 
     def process(self, version=None):
         """
-Process intermediate data from a Python dictionary to a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type. These structured arrays act as a standard data format for LCA and Monte Carlo calculations, and are the native data format for the Stats Arrays package.
+Process intermediate data from a Python dictionary to a `stats_arrays <https://pypi.python.org/pypi/stats_arrays/>`_ array, which is a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type.
 
 Processed arrays are saved in the ``processed`` directory.
 
-The structure for processed inventory databases is:
+The structure for processed inventory databases includes additional columns beyond the basic ``stats_arrays`` format:
 
 ================ ======== ===================================
 Column name      Type     Description
 ================ ======== ===================================
-uncertainty_type uint8    integer type defined in `stats_toolkit.uncertainty_choices`
+uncertainty_type uint8    integer type defined in `stats_arrays.uncertainty_choices`
 input            uint32   integer value from `Mapping`
 output           uint32   integer value from `Mapping`
 geo              uint32   integer value from `GeoMapping`
row              uint32   column filled with `MAX_INT_32`, set during matrix construction
col              uint32   column filled with `MAX_INT_32`, set during matrix construction
 type             uint8    integer type defined in `bw2data.utils.TYPE_DICTIONARY`
-amount           float32  location parameter, e.g. mean
-sigma            float32  shape parameter, e.g. std
+amount           float32  amount without uncertainty
+loc              float32  location parameter, e.g. mean
+scale            float32  scale parameter, e.g. standard deviation
+shape            float32  shape parameter
 minimum          float32  minimum bound
 maximum          float32  maximum bound
 negative         bool     `amount` < 0
             ('col', np.uint32),
             ('type', np.uint8),
             ('amount', np.float32),
-            ('sigma', np.float32),
+            ('loc', np.float32),
+            ('scale', np.float32),
+            ('shape', np.float32),
             ('minimum', np.float32),
             ('maximum', np.float32),
             ('negative', np.bool)
         ]
         arr = np.zeros((num_exchanges + len(data), ), dtype=dtype)
-        arr['minimum'] = arr['maximum'] = arr['sigma'] = np.NaN
         count = 0
         for key in sorted(data.keys(), key=lambda x: x[1]):
             production_found = False
                     MAX_INT_32,
                     TYPE_DICTIONARY[exc["type"]],
                     exc["amount"],
-                    exc.get("sigma", np.NaN),
+                    exc.get("loc", np.NaN),
+                    exc.get("scale", np.NaN),
+                    exc.get("shape", np.NaN),
                     exc.get("minimum", np.NaN),
                     exc.get("maximum", np.NaN),
                     exc["amount"] < 0

File bw2data/io/import_ecospold.py

 from ..utils import activity_hash
 from ..units import normalize_units
 from lxml import objectify
-from stats_toolkit.distributions import *
+try:
+    from stats_arrays.distributions import *
+except ImportError:
+    LognormalUncertainty = None
 import copy
 import math
 import numpy as np
             data.update({
                 'uncertainty type': LognormalUncertainty.id,
                 'amount': float(mean),
-                'sigma': math.log(math.sqrt(float(sigma)))
-                })
+                'loc': np.log(np.abs(float(mean))),
+                'scale': math.log(math.sqrt(float(sigma))),
+                'negative': float(mean) < 0,
+            })
-            if data['sigma'] == 0:
+            if data['scale'] == 0:
                 # Bad ecoinvent data
                 data['uncertainty type'] = UndefinedUncertainty.id
+                data['loc'] = data['amount']
-                del data["sigma"]
+                del data["scale"]
         elif uncertainty == 2:
             # Normal
             data.update({
                 'uncertainty type': NormalUncertainty.id,
                 'amount': float(mean),
-                'sigma': float(sigma) / 2
-                })
+                'loc': float(mean),
+                'scale': float(sigma) / 2
+            })
         elif uncertainty == 3:
             # Triangular
             data.update({
                 'uncertainty type': TriangularUncertainty.id,
                 'minimum': float(min_),
                 'maximum': float(max_)
-                })
+            })
             # Sometimes this isn't included (though it SHOULD BE)
             if exc.get("mostLikelyValue"):
-                data['amount'] = float(exc.get("mostLikelyValue"))
+                mode = float(exc.get("mostLikelyValue"))
+                data['amount'] = data['loc'] = mode
             else:
-                data['amount'] = float(mean)
+                data['amount'] = data['loc'] = float(mean)
         elif uncertainty == 4:
             # Uniform
             data.update({
             # None
             data.update({
                 'uncertainty type': UndefinedUncertainty.id,
-                'amount': float(mean)
+                'amount': float(mean),
+                'loc': float(mean),
             })
         return data
 
 
         """
 
+        if LognormalUncertainty is None:
+            print "``stats_arrays`` not installed!"
+            return
+
         self.log, self.logfile = get_io_logger("lci-import")
         self.new_activities = []
         self.new_biosphere = []
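
For reference, a hedged sketch of the exchange dict this importer builds for a lognormal flow, with invented values (lxml's `.get()` returns strings, and the ecospold `sigma` is assumed to be the squared geometric standard deviation, hence the square root):

    import math
    import numpy as np
    from stats_arrays.distributions import LognormalUncertainty

    mean, sigma = "0.5", "1.21"  # invented attribute values, as strings

    data = {
        'uncertainty type': LognormalUncertainty.id,
        'amount': float(mean),                       # static amount
        'loc': np.log(np.abs(float(mean))),          # ln of the median
        'scale': math.log(math.sqrt(float(sigma))),  # ln(GSD)
        'negative': float(mean) < 0,
    }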

File bw2data/io/import_ecospold2.py

 # from ..logs import get_io_logger
 from ..units import normalize_units
 from lxml import objectify, etree
-from stats_toolkit.distributions import *
+from stats_arrays.distributions import *
 import os
 import progressbar
 import warnings

File bw2data/method.py

 
     def process(self):
         """
-Process intermediate data from a Python dictionary to a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type. These structured arrays act as a standard data format for LCA and Monte Carlo calculations, and are the native data format for the Stats Arrays package.
+Process intermediate data from a Python dictionary to a `stats_arrays <https://pypi.python.org/pypi/stats_arrays/>`_ array, which is a `NumPy <http://numpy.scipy.org/>`_ `Structured <http://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html#numpy.recarray>`_ `Array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_. A structured array (also called record array) is a heterogeneous array, where each column has a different label and data type.
 
 Processed arrays are saved in the ``processed`` directory.
 
 Although it is not standard to provide uncertainty distributions for impact assessment methods, the structured array includes uncertainty fields.
 
-The structure for processed inventory databases is:
+The structure for processed IA methods includes additional columns beyond the basic ``stats_arrays`` format:
 
 ================ ======== ===================================
 Column name      Type     Description
 ================ ======== ===================================
-uncertainty_type uint8    integer type defined in `stats_toolkit.uncertainty_choices`
+uncertainty_type uint8    integer type defined in `stats_arrays.uncertainty_choices`
 flow             uint32   integer value from `Mapping`
index            uint32   column filled with `MAX_INT_32`, set during matrix construction
 geo              uint32   integer value from `GeoMapping`
-amount           float32  location parameter, e.g. mean
+amount           float32  amount without uncertainty
-sigma            float32  shape parameter, e.g. std
+loc              float32  location parameter, e.g. mean
+scale            float32  scale parameter, e.g. standard deviation
+shape            float32  shape parameter
 minimum          float32  minimum bound
 maximum          float32  maximum bound
 negative         bool     `amount` < 0
             ('index', np.uint32),
             ('geo', np.uint32),
             ('amount', np.float32),
-            ('sigma', np.float32),
+            ('loc', np.float32),
+            ('scale', np.float32),
+            ('shape', np.float32),
             ('minimum', np.float32),
             ('maximum', np.float32),
             ('negative', np.bool)
         ]
         arr = np.zeros((len(data), ), dtype=dtype)
-        arr['minimum'] = arr['maximum'] = arr['sigma'] = np.NaN
         for i, (key, value, geo) in enumerate(data):
             arr[i] = (
                 0,
                 MAX_INT_32,
                 geomapping[geo],
                 value,
+                value,
+                np.NaN,
                 np.NaN,
                 np.NaN,
                 np.NaN,

File bw2data/serialization.py

     def __len__(self):
         return len(self.data)
 
+    def __iter__(self):
+        return iter(self.data)
+
     def iteritems(self):
         return self.data.iteritems()
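
The new `__iter__` lets the serialized-dictionary proxies be looped over directly, which `convert_from_stats_toolkit` in bw2data/utils.py relies on (`for index, name in enumerate(methods)`). A minimal sketch of the pattern, using an invented stand-in class:

    class SerializedDictSketch(object):
        # Invented stand-in; the real class persists ``self.data`` to disk.
        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            return iter(self.data)

    methods_sketch = SerializedDictSketch({('IPCC 2007', 'GWP 100a'): {}})
    for name in methods_sketch:  # iterates over the keys, i.e. method names
        print(name)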
 

File bw2data/utils.py

 from . import config, reset_meta
 import codecs
 import hashlib
+import numpy as np
 import os
+import progressbar
 import random
 import re
 import requests
     import cStringIO as StringIO
 except ImportError:
     import StringIO
+try:
+    import stats_arrays as sa
+except ImportError:
+    import warnings
+    WARNING_TEXT = """
+
+It looks like you need to upgrade to the ``stats_arrays`` package. This is a new statistical toolkit that replaces the deprecated ``bw_stats_toolkit``. Read more at https://bitbucket.org/cmutel/stats_arrays/.
+
+To do this, use `pip` (or whatever package manager you prefer) to install `stats_arrays`, e.g.:
+
+    pip install stats_arrays
+
+Then, enter a Python interpreter, and run the following:
+
+    from bw2data.utils import convert_from_stats_toolkit
+    convert_from_stats_toolkit()
+    """
+    warnings.warn(WARNING_TEXT)
+    sa = None
 
 # Maximum value for unsigned integer stored in 4 bytes
 MAX_INT_32 = 4294967295
     zf.close()
     memory_obj.seek(0)
     return memory_obj
+
+
+def convert_from_stats_toolkit():
+    """Convert all databases from ``bw_stats_toolkit`` to ``stats_arrays`` (https://bitbucket.org/cmutel/stats_arrays/)."""
+    def update_exchange(exc):
+        if exc.get('uncertainty type', None) is None:
+            return exc
+        if 'sigma' in exc:
+            exc['scale'] = exc['sigma']
+            del exc['sigma']
+        exc['loc'] = exc['amount']
+        if exc['uncertainty type'] == sa.LognormalUncertainty.id:
+            exc['negative'] = exc['amount'] < 0
+            exc['loc'] = np.log(np.abs(exc['amount']))
+        return exc
+
+    assert sa, "Must have `stats_arrays` package for this function"
+    from bw2data import Database, databases, Method, methods
+    print "Starting inventory conversion"
+    for database in databases:
+        print "Working on %s" % database
+        db = Database(database)
+        print "\t... loading ..."
+        data = db.load()
+        print "\t... converting ..."
+        new_data = {}
+
+        for key, value in data.iteritems():
+            if 'exchanges' in value:
+                value['exchanges'] = [update_exchange(exchange)
+                                      for exchange in value['exchanges']]
+            new_data[key] = value
+
+        print "\t... writing ..."
+        db.write(new_data)
+        db.process()
+    print "Inventory conversion finished\nStarting IA conversion"
+
+    widgets = ['IA methods: ', progressbar.Percentage(), ' ',
+               progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
+               progressbar.ETA()]
+    pbar = progressbar.ProgressBar(widgets=widgets,
+                                   maxval=len(methods.list)).start()
+
+    for index, name in enumerate(methods):
+        method = Method(name)
+        method.process()
+        pbar.update(index)
+    pbar.finish()
+    print "Conversion finished"

File requirements.txt

 scipy
 voluptuous
 brightway2>=0.9.1
-bw-stats-toolkit>=0.7
+stats_arrays

File setup.py

     author="Chris Mutel",
     author_email="cmutel@gmail.com",
     license=open('LICENSE.txt').read(),
-    install_requires=["voluptuous", "progressbar", "numpy", "lxml", "scipy", "requests>=1.1.0", "brightway2"],
+    install_requires=["voluptuous", "progressbar", "numpy", "lxml", "scipy", "requests>=1.1.0", "brightway2", "stats_arrays"],
     url="https://bitbucket.org/cmutel/brightway2-data",
     long_description=open('README.rst').read(),
     classifiers=[