Anonymous avatar Anonymous committed 11955f5

Rewrote ICV, added Stacking, and a memory efficient option to compute BAGV.

Comments (0)

Files changed (1)

orangecontrib/reliability/__init__.py

 from collections import defaultdict
 from itertools import izip
 
-# Labels and final variables
-labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
-
-"""
-# All the estimators calculation constants
-DO_SA = 0
-DO_BAGV = 1
-DO_CNK = 2
-DO_LCV = 3
-DO_BVCK = 4
-DO_MAHAL = 5
-"""
-
 # All the estimator method constants
 SAVAR_ABSOLUTE = 0
 SABIAS_SIGNED = 1
 MAHAL_TO_CENTER_ABSOLUTE = 13
 DENS_ABSOLUTE = 14
 ERR_ABSOLUTE = 15
+STACKING = 101
 
 # Type of estimator constant
 SIGNED = 0
 # Names of all the estimator methods
 METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
                3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
-               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
+               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
                9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
-               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error"}
-
-select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
-select_with_repeat.random_generator = Orange.misc.Random()
+               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
+               101: "Stacking" }
 
 def get_reliability_estimation_list(res, i):
-    return [result.probabilities[0].reliability_estimate[i].estimate for result in res.results], res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, res.results[0].probabilities[0].reliability_estimate[i].method
+    return [ result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
+        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
+        res.results[0].probabilities[0].reliability_estimate[i].method
 
 def get_prediction_error_list(res):
     return [result.actual_class - result.classes[0] for result in res.results]
         y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
         return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
 
-    
 
 class BaggingVariance:
     """
     measure. Note that in this case a greater value implies a better
     prediction.
     
+    This reliability measure can quickly run out of memory if the individual
+    classifiers use a lot of memory: it builds :math:`m` of them, and therefore
+    uses :math:`m` times the memory of a single classifier. If the instances to
+    be estimated are supplied in advance (``for_instances``), only their
+    reliabilities are computed, which requires less memory.
+
     """
-    def __init__(self, m=50, name="bv"):
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
        """
        :param m: number of bagged models to build.
        :param name: name reported for this estimator.
        :param randseed: seed for the sampling-with-repetition random generator,
            so that runs are reproducible.
        :param for_instances: optional instances whose reliability will be
            estimated; when given, each bagged model is used to predict only
            these instances and is then discarded, saving memory (only these
            instances can be estimated afterwards).
        """
        self.m = m
        self.name = name
        # per-instance sampler (instead of a module-level shared one), seeded
        # with randseed for reproducibility
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances
 
     def __call__(self, instances, learner):
         classifiers = []
         else:
             classifier = None
 
+        for_inst_class = defaultdict(list)
+        this_iteration = None
+        
+        if self.for_instances:
+            his = map(_hashable_instance, self.for_instances)
+
         # Create bagged classifiers using sampling with replacement
-        for _ in xrange(self.m):
-            selection = select_with_repeat(len(instances))
+        for i in xrange(self.m):
+            this_iteration = set()
+            selection = self.select_with_repeat(len(instances))
             data = instances.select(selection)
-            classifiers.append(learner(data))
-        return BaggingVarianceClassifier(classifiers, classifier)
+            cl = learner(data)
+            if cl:
+                if self.for_instances: # predict reliability for testing instances and throw cl away
+                    for instance, hi in zip(self.for_instances, his):
+                        if hi not in this_iteration:
+                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
+                            this_iteration.add(hi)
+                else:
+                    classifiers.append(cl)
+
+        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
 
 class BaggingVarianceClassifier:
-    def __init__(self, classifiers, classifier=None):
+    def __init__(self, classifiers, classifier=None, for_inst_class=None):
         self.classifiers = classifiers
         self.classifier = classifier
+        self.for_inst_class = for_inst_class
 
     def __call__(self, instance, *args):
         BAGV = 0
 
         # Calculate the bagging variance
-        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
-            bagged_values = [c(instance, Orange.core.GetValue).value for c in self.classifiers if c is not None]
-        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
-            estimate = self.classifier(instance, Orange.core.GetProbabilities)
-            bagged_values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate) for c in self.classifiers if c is not None]
+        if self.for_inst_class:
+            bagged_values = self.for_inst_class[_hashable_instance(instance)]
+        else:
+            bagged_values = [ _bagged_value(instance, c, self.classifier) for c in self.classifiers ]
+
         k = sum(bagged_values) / len(bagged_values)
 
         BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
 
         return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
 
def _hashable_instance(instance):
    """Key usable in dicts/sets: the tuple of attribute values (class excluded)."""
    n_attrs = len(instance.domain.attributes)
    values = [instance[pos].value for pos in range(n_attrs)]
    return tuple(values)
+
def _bagged_value(instance, c, classifier):
    """Prediction value of one bagged model `c` for `instance`.

    Continuous class: the predicted value itself.  Discrete class: the
    euclidean distance between the bagged model's class distribution and
    that of `classifier` (the model built on the full data).
    Returns None for any other class type, as in the original code.
    """
    var_type = instance.domain.class_var.var_type
    if var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    if var_type == Orange.feature.Descriptor.Discrete:
        reference = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
+
+
 class LocalCrossValidation:
     """
 
     prediction error.
     
     """
-    def __init__(self, bagv=BaggingVariance(), cnk=CNeighbours(), name="bvck"):
+    def __init__(self, bagv=None, cnk=None, name="bvck"):
+        if bagv is None:
+            bagv = BaggingVariance()
+        if cnk is None:
+            cnk = CNeighbours()
         self.bagv = bagv
         self.cnk = cnk
         self.name = "bvck"
 
         return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
 
class Stacking:
    """Stacking of reliability estimates.

    Builds, via internal cross-validation, a data set whose features are the
    absolute values of the given reliability estimates and whose class is the
    absolute prediction error, then trains `stack_learner` on it.  The
    resulting model predicts the expected error of new predictions.

    :param stack_learner: learner used on the reliability-estimate data set.
    :param estimators: reliability estimators supplying the features.
    :param folds: number of internal cross-validation folds; with ``folds <= 1``
        the data is split in half instead (one half for the estimates, the
        other for training the stacking model).
    :param save_data: if True, keep the constructed reliability-estimate data
        set in ``self.classifier_data`` for inspection.
    """

    def __init__(self, stack_learner, estimators, folds=10, save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data

    def __call__(self, data, learner):

        newfeatures = None

        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [ None ] * len(data)
            for f in set(cvi): #for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                #learn reliability estimates for the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)

                #pos is used to retain the order of instances
                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [ e.method_name for e in re ]
                    # estimator order must be identical for every instance
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [ abs(e.estimate) for e in re ]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [ abs(error) ]

        else:

            #use half of the data to learn reliability estimates
            #and the other half for induction of a stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            #learn reliability estimates for the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)

            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [ e.method_name for e in re ]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [ abs(e.estimate) for e in re ]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [ abs(error) ])

        # release the per-fold reliability learner before building new models
        lf = None

        #induce the classifier on cross-validated reliability estimates
        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        #induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        if self.save_data:
            self.classifier_data = classifier_data

        return StackingClassifier(stack_classifier, lf, newdomain)
+
+
class StackingClassifier:
    """Predicts the expected absolute error of a prediction by applying the
    stacking model to the instance's reliability estimates."""

    def __init__(self, stacking_classifier, reliability_classifier, domain):
        # stacking_classifier: maps reliability estimates -> expected |error|
        self.stacking_classifier = stacking_classifier
        # domain: reliability-estimate features plus the "error" class
        self.domain = domain
        # reliability_classifier: produces the reliability estimates
        self.reliability_classifier = reliability_classifier

    def convert(self, instance):
        """ Return example in the space of reliability estimates. """
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        #take absolute values for all, consistent with the training data
        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
        tex = Orange.data.Instance(self.domain, tex)
        return tex

    def __call__(self, instance, *args):
        tex = self.convert(instance)
        r = self.stacking_classifier(tex)
        r = float(r)
        # a predicted error cannot be negative
        r = max(0., r)
        return [ Estimate(r, ABSOLUTE, STACKING) ]
+
class ICV:
    """ Perform internal cross validation (as in Automatic selection of
    reliability estimates for individual regression predictions,
    Zoran Bosnic, 2010) and return the id of the method
    that scored best on this data.

    :param estimators: candidate reliability estimators.
    :param folds: number of internal cross-validation folds.
    """

    def __init__(self, estimators, folds=10):
        self.estimators = estimators
        self.folds = folds

    def __call__(self, data, learner):

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        sum_of_rs = defaultdict(float)

        elearner = Learner(learner, estimators=self.estimators)

        #sum the correlations between estimates and errors over all folds
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)
            for r, p, sa, method in results:
                sum_of_rs[(method, sa)] += r

        #choose the (method, signed_or_absolute) pair with the largest summed r
        ranked = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        chosen = ranked[0][0]

        lf = elearner(data)
        return ICVClassifier(chosen, lf)
+
+
class ICVClassifier:
    """Returns the estimate of the single method chosen by ICV."""

    def __init__(self, chosen, reliability_classifier):
        # chosen: (method, signed_or_absolute) pair selected by ICV
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        r = None
        for e in re:
            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
                r = e.estimate
        if r is None:
            # previously this fell through to an opaque unbound-name error
            raise ValueError("chosen estimator %r not found among reliability estimates" % (self.chosen,))
        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]
+
 class Learner:
     """
     Reliability estimation wrapper around a learner we want to test.
 #            raise Exception("This method only works on data with continuous class.")
 
         return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
-
-    def internal_cross_validation(self, instances, folds=10):
-        """ Perform the internal cross validation for getting the best
-        reliability estimate. It uses the reliability estimators defined in
-        estimators attribute.
-
-        Returns the id of the method that scored the best.
-
-        :param instances: Data instances to use for ICV.
-        :type instances: :class:`Orange.data.Table`
-        :param folds: number of folds for ICV.
-        :type folds: int
-        :rtype: int
-
-        """
-        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
-        results = get_pearson_r(res)
-        sorted_results = sorted(results)
-        return sorted_results[-1][3]
-
-    def internal_cross_validation_testing(self, instances, folds=10):
-        """ Perform internal cross validation (as in Automatic selection of
-        reliability estimates for individual regression predictions,
-        Zoran Bosnic, 2010) and return id of the method
-        that scored best on this data.
-
-        :param instances: Data instances to use for ICV.
-        :type instances: :class:`Orange.data.Table`
-        :param folds: number of folds for ICV.
-        :type folds: int
-        :rtype: int
-
-        """
-        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)
-
-        list_of_rs = []
-
-        sum_of_rs = defaultdict(float)
-
-        for fold in xrange(folds):
-            data = instances.select(cv_indices, fold)
-            if len(data) < 10:
-                res = Orange.evaluation.testing.leave_one_out([self], data)
-            else:
-                res = Orange.evaluation.testing.cross_validation([self], data)
-            results = get_pearson_r(res)
-            for r, _, _, method in results:
-                sum_of_rs[method] += r
-        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
-        return sorted_sum_of_rs[0][0]
-
-    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
-
+ 
 class Classifier:
     """
     A reliability estimation wrapper for classifiers.
 def acc_rel_correlation(method, data, learner):
     import scipy.stats
     rels, acc = get_acc_rel(method, data, learner)
-    return scipy.stats.spearmanr(acc, rels)[0]
+    return scipy.stats.spearmanr(acc, rels)[0]
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.