Commits

Miha Stajdohar committed bcd7e28

Supported Random forest model map.

Comments (0)

Files changed (1)

orangecontrib/modelmaps/modelmap.py

 
 .. autoclass:: Orange.modelmaps.BuildModelMap
    :members:
-   
+
 **************
 Help Functions
 **************
 .. autofunction:: get_feature_subsets
 
 """
-
 import bz2, itertools, math, random, os.path, time, uuid, re, sys
 import cPickle as pickle
 
 from Orange.classification.knn import kNNLearner
 from Orange.classification.tree import SimpleTreeLearner, TreeLearner
 
+
 MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM", "RF"]
 
+
 def distance_mi(m1, m2):
     """Return inverted normalized mutual information.
 
     1 - NMI(m1.instance_predictions, m2.instance_predictions)
+
     """
-
     classes1 = np.unique(m1.instance_predictions)
     classes2 = np.unique(m2.instance_predictions)
     m1_classes = [m1.instance_predictions == c for c in classes1]
 
     return 0 if h1 == 0 and h2 == 0 else 1. - 2 * mi / (h1 + h2)
 
+
 def distance_class(m1, m2):
     """Return the fraction of instances on which the two models disagree.

     Averages the element-wise inequality of the models' prediction
     vectors; falls back to 1 (maximal distance) when the average is NaN
     (e.g. empty prediction vectors).

     """
     w = np.average(m1.instance_predictions != m2.instance_predictions)
     return 1 if math.isnan(w) else w
 
+
 def distance_manhattan(m1, m2):
     """Return the mean Manhattan (L1) distance between the models'
     class-probability matrices, divided by 2 so the result lies in [0, 1].

     """
     return np.sum(np.abs(m1.probabilities - m2.probabilities)) / 2. / len(m1.probabilities)
 
+
 def distance_euclidean(m1, m2):
     """Return the mean per-instance Euclidean distance between the models'
     class-probability rows, divided by sqrt(2) to normalize to [0, 1].

     """
     return np.sum(np.sqrt(np.sum((m1.probabilities - m2.probabilities)**2, axis=1))) / math.sqrt(2.) / len(m1.probabilities)
 
+
 def distance_rank(m1, m2):
     """Return 1 minus the absolute Spearman rank correlation between the
     models' probability matrices (flattened, since axis=None).

     """
     return 1 - abs(scipy.stats.spearmanr(m1.probabilities, m2.probabilities, axis=None)[0])
 
+
 def get_feature_subsets_scatterplot(domain, nsubsets):
     """Return attribute subsets for Scatter Plot."""
     attrs = []
 
     return attrs[:nsubsets]
 
-def get_feature_subsets(domain, nsubsets=None, min_features=None, max_features=None):
+
+def get_feature_subsets(domain, nsubsets=None, min_features=None, max_features=None, seed=None):
     """Return random attribute subsets.
-    
+
     :param domain: data set domain to extract features
     :type domain: :obj:`Orange.data.Domain`
-    
+
     :param nsubsets:  number of attribute subsets
     :type nsubsets: int
-    
+
     """
+    if seed:
+        random.seed(seed)
 
     def binomial(n, k):
         if n > k:
         return list(itertools.compress(combinations, selectors))
     #return list(itertools.compress(combinations, xrange(10)))
 
+
 def get_models_table():
     """Return an empty data table for model meta data."""
 
 
 
 def load(file_name):
-    """Load a model map. Read compressed tuple containing model similarity
-    matrix and data table.
+    """Load model map.
+
+    Read compressed tuple containing model similarity matrix and data table.
 
     """
-
     base, ext = os.path.splitext(file_name)
     file_name = base if ext.lower() == ".bz2" else file_name
 
     smx, table, data = pickle.load(bz2.BZ2File('%s.bz2' % file_name, "r"))
     return smx, table, data
 
+
 def save(file_name, smx, model_data, original_data):
-    """Save model map to disk. Model similarity matrix and data table tuple
-    is pickled and compressed as a bz2 archive.
+    """Save model map.
+
+    Model similarity matrix and data table tuple is pickled and compressed as
+    a bz2 archive.
 
     """
-
     if original_data is None or smx is None or model_data is None:
         raise AttributeError("Distance matrix, model meta-data table, and original data table must be given.")
 
 
     pickle.dump((smx, model_data, original_data), bz2.BZ2File('%s.bz2' % file_name, "w"), -1)
 
+
 class BuildModelMap(object):
 
-    def __init__(self, fname, folds=10, model_limit=500):
+    def __init__(self, fname, folds=10, model_limit=500, seed=42):
 
         self.model_limit = model_limit
         self.data_d = self._get_data(fname)
         #self.data_c = self._get_data(fname, continuize=True)
         self.data_d = data.filter.IsDefined(domain=self.data_d.domain)(self.data_d)
-
         self.folds = folds if len(self.data_d) < 2000 else 2
-
-        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
+        self.seed = seed
+        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=seed)
 
     def _get_data(self, fname, continuize=False):
         """Return a data Table.
-           
+
         :param fname: data set file name
         :type fname: string
-        
+
         :param continuize:  if true, it tries to load a name-c.tab data table as Orange DomainContinuizer changes attribute names.
         :type continuize: bool
-        
+
         """
-
         if continuize:
             base, ext = os.path.splitext(fname)
             if base[-2:] == "-c":
     def data(self):
         return self.data_d
 
-    def build_model(self, learner, data):
+    def build_model(self, attributes, learner):
         """Build a classification meta-model.
-        
+
+        :param attributes: subset of attributes
+        :type attributes: list of strings
         :param learner: classification learner to wrap
         :type learner: :obj:`Orange.classification.Learner`
-        
-        :param data: data set
-        :type data: :obj:`Orange.data.Table`
-        
+
         """
+        attributes = list(attributes)
+        attributes.append(self.data().domain.class_var)
+        d = data.Domain(attributes, self.data().domain)
+        _data = data.Table(d, self.data())
 
         probabilities = []
         instance_predictions = []
         res = []
         # estimate class probabilities using CV
         for fold in range(self.folds):
-            learnset = data.selectref(self.indices, fold, negate=1)
-            testset = data.selectref(self.indices, fold, negate=0)
+            learnset = _data.selectref(self.indices, fold, negate=1)
+            testset = _data.selectref(self.indices, fold, negate=0)
             classifier = learner(learnset)
             tcn = 0
-            for i in range(len(data)):
+            for i in range(len(_data)):
                 if (self.indices[i] == fold):
                     ex = data.Instance(testset[tcn])
                     ex.setclass("?")
                         raise "Classifier %s returned unknown value" % (classifier.name)
 
                     probabilities.append(list(cr[1]))
-                    instance_predictions.append(cr[0])
-                    instance_classes.append(testset[tcn].get_class())
+                    instance_predictions.append(cr[0].value)
+                    instance_classes.append(testset[tcn].get_class().value)
                     tcn += 1
 
         return model.Model(type(learner).__name__,
-                     learner(data),
+                     learner(_data),
                      np.array(probabilities),
-                     {val: i for i, val in enumerate(self.data_d.domain.class_var.values)},
-                     [x.name for x in data.domain.attributes],
-                     instance_predictions,
-                     instance_classes)
+                     {val: i for i, val in enumerate(self.data().domain.class_var.values)},
+                     [x.name for x in _data.domain.attributes],
+                     np.array(instance_predictions),
+                     np.array(instance_classes))
 
     def build_projection_model(self, attributes, visualizationMethod):
         """Build a projection meta-model."""
-
         method = "?"
         if visualizationMethod == vr.SCATTERPLOT:
             graph = data.preprocess.scaling.ScaleScatterPlotData()
                 break
 
         return model.Model(method,
-                     None, #learner(table),
+                     learner(table),
                      np.array(probabilities),
                      {val: i for i, val in enumerate(self.data_d.domain.class_var.values)},
                      attributes,
                      XAnchors=XAnchors,
                      YAnchors=YAnchors)
 
+    def build_rf_models(self, trees=50, max_depth=4, min_instances=5):
+        """Build Random forest and return tree models.
 
-    def build_rf_models(self, trees=50, max_depth=2, three_folds=False):
+        :param trees: number of trees in the forest
+        :type trees: int
+        :param max_depth: maximal tree depth
+        :type max_depth: int
+        :param min_instances: nodes with less than min_instances instances are not split further
+        :type min_instances: int
+
+        """
+        indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=self.seed)(self.data())
+        train = self.data().select(indices, 0)
+        test = self.data().select(indices, 1)
+        models = []
+
+        rf_learner = ensemble.forest.RandomForestLearner(trees=trees*2, base_learner=TreeLearner(max_depth=max_depth, min_instances=min_instances), name="RF: %d trees; max depth: None; min instances: %d" % (trees, min_instances))
+        rf_classifier = rf_learner(train)
+
+        def get_features(cls, domain):
+            def tree_attr(node):
+                if not node or node.branch_selector is None:
+                    return []
+
+                size = [node.branch_selector.class_var.name]
+                if node.branch_selector:
+                    for branch in node.branches:
+                            size += tree_attr(branch)
+                return size
+
+            return list(set(tree_attr(cls.tree)))
+
+        for c in rf_classifier.classifiers:
+            # compute model performance on test data
+            probabilities, instance_predictions, instance_classes = [], [], []
+            for ex in test:
+                ex = data.Instance(ex)
+                instance_classes.append(ex.get_class().value)
+                ex.setclass("?")
+                cl, prob = c(ex, c.GetBoth)
+                if cl.isSpecial():
+                    raise "Classifier %s returned unknown value" % c.name
+                probabilities.append(list(prob))
+                instance_predictions.append(cl.value)
+
+            m = model.Model("RF",
+                c,
+                np.array(probabilities),
+                {val: k for k, val in enumerate(test.domain.class_var.values)},
+                get_features(c, test.domain),
+                np.array(instance_predictions),
+                np.array(instance_classes)
+            )
+            # save model performance on test data
+            instance = m.get_instance(get_models_table().domain)
+
+            # compute model predictions for model similarity on all data
+            probabilities, instance_predictions, instance_classes = [], [], []
+            for ex in self.data():
+                ex = data.Instance(ex)
+                instance_classes.append(ex.get_class().value)
+                ex.setclass("?")
+                cl, prob = c(ex, c.GetBoth)
+                if cl.isSpecial():
+                    raise "Classifier %s returned unknown value" % c.name
+                probabilities.append(list(prob))
+                instance_predictions.append(cl.value)
+
+            m = model.Model("RF",
+                c,
+                np.array(probabilities),
+                {val: k for k, val in enumerate(test.domain.class_var.values)},
+                get_features(c, test.domain),
+                np.array(instance_predictions),
+                np.array(instance_classes)
+            )
+
+            # set model performance on test data
+            m.set_instance(instance)
+            models.append(m)
+
+        return models
+
+    def build_rf(self, trees=50, max_depth=2, three_folds=False):
         if three_folds:
-            indices = data.sample.SubsetIndicesCV(folds=3, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(self.data_d)
+            indices = data.sample.SubsetIndicesCV(folds=3, stratified=data.sample.SubsetIndices.Stratified, randseed=self.seed)(self.data_d)
         else:
-            indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(self.data_d)
+            indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=self.seed)(self.data_d)
 
         rv = []
 
 
         return rv[0] if len(rv) == 1 else rv
 
-
     def _print_time(self, time_start, iter, numiter):
         time_elapsed = time.time() - time_start
         time_total = time_elapsed / iter * (numiter * (numiter-1) / 2.)