Commits

Chris Mutel  committed 8848cdc

Split AMOEBA calculations into separate library

  • Participants
  • Parent commits 754ffdc

Comments (0)

Files changed (5)

File lcia_autocorrelation/amoeba.py

-# -*- coding: utf-8 -*
-from __future__ import division
-from numpy import *
-from math import sqrt
-try:
-    import progressbar as pb
-except ImportError:
-    pb = None
-import time
-from tier_dictionary import TierDictionary
-from shapefile import ShapefileWriter, ShapefileReader
-from utils import buffer_gdal_geometry
-
-
-class NoNeighborsError(StandardError):
-    pass
-
-
-class AMOEBA(object):
-    """
-Calculates a spatial weights matrix using AMOEBA algorithm.
-
-The basic algorithm is::
-
-    for each geographic unit:
-        neighbors = [original unit]
-        calculate g_star of neighbors
-        while True
-            find all neighbors, excluding excluded neighbors
-            find set of new neighbors that maximizes g_star
-            if new g_star < old g_star
-                break
-            neighbors += new neighbors
-            excluded += rejected neighbors
-
-.. rubric:: Inputs
-
-* obj: A shapefile or Django queryset of geography objects and weights.
-
-Aldstadt, J. & Getis, A. (2006). Using AMOEBA to Create a Spatial Weights Matrix and Identify Spatial Clusters. Geographical Analysis, 38(4), 327--343. 
-    """
-    def __init__(self, ds, export_shp=False, progressbar=False, 
-            callback=None):
-        self.ds = ds
-        self.export_shp = export_shp
-        self.pb = progressbar and pb
-        self.callback = callback
-        self.weights_dict = self.get_weights_dict()
-        self.geo_ids = tuple(self.weights_dict.keys())
-        self.number = float(len(self.geo_ids))
-        values = array(self.weights_dict.values())
-        self.average = float(average(values))
-        self.pseudo_std = float(sqrt(
-            (values**2).sum() / self.number - self.average**2)
-            )
-
-    def go(self, objs=None):
-        # Used for testing, with new QS
-        objs = objs or self.ds
-
-        results = {}
-        try:
-            count = objs.count()
-        except TypeError:
-            count = len(objs)
-
-        if self.pb:
-            widgets = ['Weightings: ', pb.Percentage(), ' ', 
-                pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
-            pbar = pb.ProgressBar(widgets=widgets, maxval=count).start()
-
-        start_time = time.time()
-
-        for index, obj in enumerate(objs):
-            if not index % 5000 and index > 0 and not self.pb:
-                print "%s/%s" % (index, len(objs))
-            if not index % 50 and index > 0 and self.callback:
-                # Send a progress message
-                current_time = time.time()
-                elapsed = current_time - start_time
-                eta = elapsed * (1/(index/count) - 1)
-                self.callback("completed %s/%s" % (index, count), eta=eta, 
-                    count=index, total=count, elapsed=elapsed)
-
-            oid = self.get_oid(obj)
-            results[oid] = self.iterate(obj)
-
-            if self.pb:
-                pbar.update(index)
-
-        if self.pb:
-            pbar.finish()
-
-        return results
-
-    def g_star(self, values_sum, values_num):
-        """
-Calculate g-star statistic for spatial unit in relation to neighbors. 
-
-The Gi* statistic can be considered a standard normal variate.  So, the cumulative probability of G*i(k_max) would be the cdf value for the standard normal distribution of the value G*i(k_max).  This value could be close to 1, but could also be much closer to zero in some cases.  So, using equation 2 of the AMOEBA paper, elements in the k_max level of contiguity would have zero weights with unit i.  These units are at the edge of the ecotope centered on i.  The units closer to i would have positive weights that increase as proximity to i increases. The value Gi*(0) is the Gi* value for the unit itself.
-        """
-        return (values_sum - self.average * values_num) / (
-            self.pseudo_std * sqrt(
-                (self.number - values_num) * values_num / (self.number - 1)
-                )
-            )
-
-    def get_trend(self, obj, tiers):
-        """Do a first round of expansion, and determine the trend direction"""
-        neighbors = self.expand(tiers, set([0]))
-        if not neighbors:
-            raise NoNeighborsError
-
-        # Create array with col 1: geo ids, col 2: weights
-        data = array([self.weights_dict[key] for key in neighbors])
-        # Sort by weight ascending
-        data = data[argsort(data)]
-
-        orig_value = float(sum(tiers.all_weights()))
-        if tiers.score < 0:
-            operator = max
-        else:
-            operator = min
-
-        up_score = operator([self.g_star(float(data[:index].sum()) + \
-            orig_value, index + 1) for index in range(1, data.shape[0] + 1)])
-        down_score = operator([self.g_star(float(data[::-1][:index].sum()) + \
-            orig_value, index + 1) for index in range(1, data.shape[0] + 1)])
-        if tiers.score < 0:
-            trend = "downwards" if (down_score < up_score) else "upwards"
-        else:
-            trend = "downwards" if (down_score > up_score) else "upwards"
-        return trend
-
-    def iterate(self, obj):
-        oid = self.get_oid(obj)
-        weight = self.weights_dict[oid]
-        tiers = TierDictionary(oid, weight, self.g_star(weight, 1))
-        excluded = set([0])
-
-        try:
-            trend = self.get_trend(obj, tiers)
-        except NoNeighborsError:
-            return tiers.normalized
-
-        while True:
-            neighbors = self.expand(tiers, excluded)
-            if not neighbors:
-                # No new neighbors found; stop iteration
-                break
-
-            # Create array with col 1: geo ids, col 2: weights
-            data = hstack((array(neighbors).reshape((-1,1)), array([ 
-                self.weights_dict[key] for key in neighbors]).reshape((-1,1) 
-                )))
-            # Sort by weight
-            if trend == "upwards":
-                data = data[argsort(data[:,1])]
-            else:
-                data = data[argsort(data[:,1])[::-1]]
-
-            already_values = tiers.all_weights()
-            already_number = float(already_values.shape[0])
-            already_sum = float(already_values.sum())
-            base_score = loop_score = tiers.score
-            new_neighbors = set()
-
-            for index in range(1, data.shape[0]):
-                values = data[:index, 1]
-
-                new_score = self.g_star(float(values.sum()) + already_sum,
-                    float(values.shape[0]) + already_number)
-
-                if not (new_score > loop_score > 0 or new_score < \
-                        loop_score < 0):
-                    break
-                else:
-                    new_neighbors = set([int(x) for x in data[:index,0]])
-                    loop_score = new_score
-
-            if loop_score == base_score:
-                # No better set of neighbors was found
-                break
-            else:
-                # Add neighbors and set new high score
-                excluded = excluded.union(set(neighbors).difference( 
-                    set(new_neighbors)))
-                tiers.add(new_neighbors, self.weights_dict, loop_score)
-
-        if self.export_shp:
-            self.write_to_shapefile(tiers, obj)
-
-        return tiers.normalized
-
-
-class ShapefileAMOEBA(AMOEBA):
-    """AMOEBA algorithm adapted to the objects defined by the Python GDAL shapefile interface."""
-    def expand(self, tiers, excluded):
-        if tiers.level > 10:
-            # One could continue to expand, but there is not really any 
-            # point; the spatial weights are < 1e-6 anyway.
-            return []
-        return list(
-            reduce(
-                set.union, 
-                [set(self.ds.intersects( 
-                    buffer_gdal_geometry(
-                        self.ds.layer[index].geom, 0.00001), 
-                    set.union(excluded, set([index]), tiers.all_geo)
-                )) for index in tiers.geo[tiers.level]]
-            ))
-
-    def get_oid(self, o):
-        return o.fid
-
-    def write_to_shapefile(self, tiers, obj):
-        """Write tiers to shapefile for inspection."""
-        filename = self.ds.filepath.split("/")[-1].split(".")[0]
-        sf = ShapefileWriter('amoeba-output-%s-%s' % (filename, 
-            self.get_oid(obj)), self.ds.layer[0].geom, "amoeba", id=int, 
-            weight=float, cf=float)
-
-        for key, weight in tiers.normalized.iteritems():
-            obj = self.ds.layer[key]
-            sf.add_record(obj.geom, id=key, weight=weight, 
-                cf=obj.get(self.ds.amount_field))
-
-        sf.close()
-
-    def get_weights_dict(self):
-        return self.ds.mapping_dict()
-

File lcia_autocorrelation/autocorrelate.py

-import sys, os
-from ac_shapefile import AutocorrelationShapefile
-
-if len(sys.argv) != 2:
-    print "Can't understand arguments - please give filename e.g. \n\tpython autocorrelate.py path/to/file/filename.shp"
-    sys.exit()
-try:
-    os.stat(sys.argv[1])
-except OSError:
-    print "Can't find or access file %s" % sys.argv[1]
-    sys.exit()
-
-AutocorrelationShapefile(filename).global_autocorrelation()

File lcia_autocorrelation/shapefile.py

-# -*- coding: utf-8 -*
-import os
-from rtree import index
-from osgeo import ogr, osr
-from django.contrib.gis.gdal import DataSource
-
-class ShapefileReader(object):
-    """Wrapper around a shapefile to provide a nicer interface."""
-    def __init__(self, filepath, layer=0, amount_field=None):
-        self.filepath = filepath
-        self.datasource = DataSource(self.filepath)
-        self.layer = self.datasource[layer]
-        self.amount_field = amount_field or self.layer.fields[0]
-
-    def create_spatial_index(self):
-        """Create and populate r-tree spatial index."""
-        self.index = index.Index()
-        for feat in self.layer:
-            self.index.add(feat.fid, feat.geom.envelope.tuple)
-
-    def intersects(self, geom, excluded=None):
-        """Find features in the shapefile that intersect the given geom."""
-        if not excluded:
-            excluded = ()
-        in_index = self.index.intersection(geom.envelope.tuple)
-        return [i for i in in_index if self.layer[i 
-            ].geom.intersects(geom) and i not in excluded]
-
-    def intersects_multipolygon(self, mp):
-        """Iterate over all polygons in a multipolygon"""
-        return set.union(*[set(self.intersects(part)) for part in mp])
-
-    def count(self):
-        """For Django queryset compatibility."""
-        return self.__len__()
-
-    def mapping_dict(self):
-        """Create a dictionary mapping feature indices to amounts/weights"""
-        self.mapping = dict([(f.fid, float(f.get(self.amount_field))) for f \
-            in self.layer])
-        return self.mapping
-
-    def __len__(self):
-        return len(self.layer)
-
-    def __iter__(self):
-        return iter(self.layer)
-
-
-class ShapefileWriter(object):
-    def __init__(self, filename, geo, layer="geom", geom_type=None, **kwargs):
-        self.filename = filename + ".shp"
-        self.fields = {}
-        self.create_file()
-        # Infer geometry type from example geo
-        if hasattr(geo, "geom_type"):
-            geom_type = getattr(ogr, "wkb%s" % geo.geom_type)
-        elif geom_type:
-            pass
-        else:
-            raise ValueError, "Couldn't determine geometry type"
-        self.layer = self.ds.CreateLayer(layer, geom_type=geom_type)
-        for key, value in kwargs.iteritems():
-            self.create_field(key, value)
-        self.sr = osr.SpatialReference()
-        self.sr.ImportFromWkt(geo.srs.wkt)
-        self.feature_def = self.layer.GetLayerDefn()
-
-    def create_file(self):
-        driver = ogr.GetDriverByName('ESRI Shapefile')
-        if os.path.exists(self.filename):
-            driver.DeleteDataSource(self.filename)
-        self.ds = driver.CreateDataSource(self.filename)
-
-    def create_field(self, name, kind):
-        assert name not in self.fields
-        if kind == int:
-            self.fields[name] = ogr.FieldDefn(name, ogr.OFTInteger)
-        elif kind == float:
-            self.fields[name] = ogr.FieldDefn(name, ogr.OFTReal)
-            self.fields[name].SetWidth(14)
-            self.fields[name].SetPrecision(6)
-        elif kind == str:
-            self.fields[name] = ogr.FieldDefn(name, ogr.OFTString)
-        self.layer.CreateField(self.fields[name])
-
-    def add_record(self, obj, **kwargs):
-        feature = ogr.Feature(self.feature_def)
-        ogr_geo = ogr.CreateGeometryFromWkt(obj.wkt)
-        ogr_geo.AssignSpatialReference(self.sr)
-        feature.SetGeometry(ogr_geo)
-        for key, value in kwargs.iteritems():
-            feature.SetField(key, value)
-        self.layer.CreateFeature(feature)
-        feature.Destroy()
-
-    def close(self):
-        # Flush to file
-        self.ds.Destroy()
-

File lcia_autocorrelation/tier_dictionary.py

-# -*- coding: utf-8 -*
-from __future__ import division
-from numpy import array
-from scipy.stats import norm
-from utils import flatten
-
-class TierDictionary(object):
-    """Data structure for storing AMOEBA ecotopes; keeps level and objects."""
-    def __init__(self, obj_id, obj_weight, score):
-        self.geo = {0: set([obj_id])}
-        self.all_geo = set([obj_id])
-        self.weights = {0: [obj_weight]}
-        self.scores = {0: score}
-        self.level = 0
-
-    def __repr__(self):
-        return "TierDictionary with levels/geos:\n%s" % self.geo
-
-    def all_weights(self, max_level=1e12):
-        return array(flatten([values for key, values in \
-            self.weights.iteritems() if key <= max_level]))
-
-    def add(self, objs, mapping, score):
-        if set(objs).intersection(self.all_geo):
-            print set(objs).intersection(self.all_geo)
-            raise ValueError
-        self.level += 1
-        self.geo[self.level] = set(objs)
-        self.all_geo = self.all_geo.union(set(objs))
-        self.weights[self.level] = [mapping[key] for key in objs]
-        self.scores[self.level] = score
-
-    @property
-    def score(self):
-        return self.scores[self.level]
-
-    @property
-    def normalized(self):
-        """Return a dictionary with geo IDs as keys, and weights as values"""
-        k_max = self.level
-        if k_max == 0:
-            return {}
-        elif k_max == 1:
-            # Only 1 for neighbors, not original element
-            num_values = len(self.geo[1])
-            return dict([(x,1/num_values) for x in self.geo[1]])
-        else:
-            res = {}
-            denominator = norm.cdf(self.score) - norm.cdf(self.scores[0])
-            for level in xrange(1, self.level + 1):
-                numerator = norm.cdf(self.score) - norm.cdf(self.scores[ 
-                    level])
-                value = float(numerator / denominator)
-                if value > 0:
-                    res.update(dict([(key, value) for key in self.geo[ 
-                        level]]))
-            return res
-

File lcia_autocorrelation/utils.py

-from django.contrib.gis.geos import fromstr
-from django.contrib.gis.gdal import OGRGeometry
-
-def buffer_gdal_geometry(geom, buff):
-    return OGRGeometry(fromstr(geom.wkt).buffer(0.2).wkt)
-
-def flatten(x):
-    """Flatten a set of iterables into a single list.
-    
-    From http://kogs-www.informatik.uni-hamburg.de/~meine/python_tricks"""
-    result = []
-    for el in x:
-        if hasattr(el, '__iter__') and not isinstance(el, basestring):
-            result.extend(flatten(el))
-        else:
-            result.append(el)
-    return result