Commits

Miha Stajdohar  committed bf13c85

Fixed bugs, added readthedocs requirements.

  • Participants
  • Parent commits a818312

Comments (0)

Files changed (6)

File _modelmaps/modelmap.py

 
 from Orange import data, distance, ensemble, feature, misc, projection
 from Orange.classification.knn import kNNLearner
-from Orange.classification.tree import SimpleTreeLearner
+from Orange.classification.tree import SimpleTreeLearner, TreeLearner
 
 MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM", "RF"]
 
                      YAnchors=YAnchors)
 
 
-    def build_rf_models(self, trees=50, max_depth=2):
-        indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(self.data_d)
-        train = self.data_d.select(indices, 0)
-        test = self.data_d.select(indices, 1)
+    def build_rf_models(self, trees=50, max_depth=2, three_folds=False):
+        if three_folds:
+            indices = data.sample.SubsetIndicesCV(folds=3, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(self.data_d)
+        else:
+            indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(self.data_d)
 
-        class SimpleTreeLearnerSetProb():
-            """
-            Orange.classification.tree.SimpleTreeLearner which sets the skip_prob
-            so that on average a square root of the attributes will be
-            randomly choosen for each split.
-            """
-            def __init__(self, wrapped):
-                self.wrapped = wrapped
+        rv = []
 
-            def __call__(self, examples, weight=0):
-                self.wrapped.skip_prob = 1-len(examples.domain.attributes)**0.5/len(examples.domain.attributes)
-                return self.wrapped(examples)
+        for i in range(3 if three_folds else 1):
+            train = self.data_d.select(indices, 0)
 
-        min_instances = 5
-        # uses gain ratio
-        #tree = SimpleTreeLearnerSetProb(SimpleTreeLearner(max_depth=max_depth, min_instances=min_instances))
-        #rf_learner = ensemble.forest.RandomForestLearner(learner=tree, trees=trees, name="RF: %d trees; max depth: %d; min instances: %d" % (trees, max_depth, min_instances))
-        rf_learner = ensemble.forest.RandomForestLearner(trees=trees, name="RF: %d trees; max depth: None; min instances: %d" % (trees, min_instances))
-        rf_classifier = rf_learner(train)
+            min_instances = 5
+            rf_learner = ensemble.forest.RandomForestLearner(trees=trees*2, base_learner=TreeLearner(), name="RF: %d trees; max depth: None; min instances: %d" % (trees, min_instances))
+            rf_classifier = rf_learner(train)
 
-        def get_features(cls, domain):
-            features = re.findall('{ [01] \d+ (\d+)', pickle.dumps(cls))
-            return [domain[i].name for i in map(int, features)]
+            remaining_folds = range(3)
+            remaining_folds.remove(i)
 
-        models = []
-        for c in rf_classifier.classifiers:
-            probabilities = []
-            instance_predictions = []
-            instance_classes = []
-            for ex in test:
-                ex = data.Instance(ex)
-                instance_classes.append(ex.get_class().value)
-                ex.setclass("?")
-                cl, prob = c(ex, c.GetBoth)
-                if cl.isSpecial():
-                    raise "Classifier %s returned unknown value" % c.name
-                probabilities.append(list(prob))
-                instance_predictions.append(cl.value)
+            for j in range(2 if three_folds else 1):
 
-            models.append(Model("RF",
-                                c,
-                                np.array(probabilities),
-                                {val: i for i, val in enumerate(test.domain.class_var.values)},
-                                get_features(c, test.domain),
-                                np.array(instance_predictions),
-                                np.array(instance_classes),
-                                XAnchors=None,
-                                YAnchors=None))
+                test = self.data_d.select(indices, remaining_folds[j])
 
-        return models, rf_classifier, test
+                def get_features(cls, domain):
+                    #features = re.findall('{ [01] \d+ (\d+)', pickle.dumps(cls))
+                    #return [domain[i].name for i in map(int, features)]
+                    def tree_attr(node):
+                        if not node or node.branch_selector is None:
+                            return []
+
+                        size = [node.branch_selector.class_var.name]
+                        if node.branch_selector:
+                            for branch in node.branches:
+                                    size += tree_attr(branch)
+                        return size
+
+                    return tree_attr(cls.tree)
+
+                models = []
+                for c in rf_classifier.classifiers:
+                    probabilities = []
+                    instance_predictions = []
+                    instance_classes = []
+                    for ex in test if three_folds else train:
+                        ex = data.Instance(ex)
+                        instance_classes.append(ex.get_class().value)
+                        ex.setclass("?")
+                        cl, prob = c(ex, c.GetBoth)
+                        if cl.isSpecial():
+                            raise "Classifier %s returned unknown value" % c.name
+                        probabilities.append(list(prob))
+                        instance_predictions.append(cl.value)
+
+                    models.append(Model("RF",
+                                        c,
+                                        np.array(probabilities),
+                                        {val: k for k, val in enumerate(test.domain.class_var.values)},
+                                        get_features(c, test.domain),
+                                        np.array(instance_predictions),
+                                        np.array(instance_classes),
+                                        XAnchors=None,
+                                        YAnchors=None))
+
+                rv.append((models, rf_classifier, self.data_d.select(indices, remaining_folds[(j + 1) % 2]) if three_folds else test))
+
+        return rv[0] if len(rv) == 1 else rv
 
 
     def _print_time(self, time_start, iter, numiter):

File examples/ensemble/build_rf.py

 
     build_map = mm.BuildModelMap(fname)
 
-    trees = 200
-    depth = 1000
+    trees = 150
 
     print "build models..."
-    models, models_2, rf_classifier, _ = build_map.build_rf_models(trees=trees, max_depth=depth)
+    models, models_1, rf_classifier, _ = build_map.build_rf_models(trees=trees, max_depth=None, three_folds=False)
 
     print "build model data..."
     table = build_map.build_model_data(models)
-    table_2 = build_map.build_model_data(models_2)
+    table_1 = build_map.build_model_data(models_1)
 
     print "build matrix..."
     smx = build_map.build_model_matrix(models)
-    smx_2 = build_map.build_model_matrix(models_2)
-    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_depth_None_%s" % (DATASET, len(models), sys.platform)), smx, table, build_map.data())
-    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_depth_None_%s" % (DATASET, len(models_2), sys.platform)), smx_2, table_2, build_map.data())
+    smx_1 = build_map.build_model_matrix(models_1)
+    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models), sys.platform)), smx, table, build_map.data())
+    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models_1), sys.platform)), smx_1, table_1, build_map.data())
 
 #build_rd_map("zoo")
 #build_rd_map("marketing")

File examples/ensemble/mm_forest.py

 import matplotlib
 matplotlib.use('Agg')
 
-import math, os.path, random, re, sys
+import os.path, random, re, sys
 import cPickle as pickle
 import matplotlib.pyplot as plt
 import numpy as np
 from itertools import groupby
 from operator import itemgetter
 
-from Orange import clustering, data, distance, ensemble, evaluation, network, utils
-from Orange.classification.tree import SimpleTreeLearner
+from Orange import clustering, distance, evaluation, utils
 
 import _modelmaps as mm
 
 ROOT = "/home/miha/work/res/modelmaps"
-#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
 
 scores = {}
 cache = {}
 
+NTREES = 1000
+MAXCLUSTERS = 120
+EXPMARKER = "1000_120_2fold_tree_base"
+
 def plot_scores(DATASET):
     print "drawing plots..."
     fig = plt.figure(figsize=(6, 8), dpi=300)
         for label in ax.get_yticklabels():
             label.set_fontsize('small')
 
+        ax.set_ybound(0., 1.)
+
         ax.set_xlabel('trees')
         ax.set_ylabel(type.upper())
         subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
     add_scores_plot(1, "auc")
     add_scores_plot(2, "brier")
 
-    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_m3.png" % (DATASET, 2, len(scores[DATASET]["ca"]))))
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.png" % (EXPMARKER, DATASET)))
 
 
 def score_trees(rf_classifier, trees, test, DATASET, cluster_indices=None):
     score = scores.get(DATASET, {"ca": [], "auc": [], "brier": [], "ca_mm": [], "auc_mm": [], "brier_mm": []})
 
     if cluster_indices is None:
-        models = [random.sample(trees, i) for i in range(2, 120)]
+        models = [random.sample(trees, i) for i in range(2, min(MAXCLUSTERS * 2, len(trees)))]
     else:
         models = [[trees[random.choice(cluster)] for cluster in clustering] for clustering in cluster_indices]
 
     for trees in models:
         rf_classifier.classifiers = trees
-
-        #rf_classifier.classifiers = trees[:i]
         classifiers = [rf_classifier]
 
-        #    results = evaluation.testing.learn_and_test_on_test_data(learners, train, test)
-
         test_results = evaluation.testing.ExperimentResults(1,
             classifier_names = ["RF"],
             domain=test.domain,
 
         build_map = mm.BuildModelMap(fname)
 
-        ntrees = 500
-
         print "build models..."
-        models, rf_classifier, test = build_map.build_rf_models(trees=ntrees, max_depth=depth)
+        models, rf_classifier, test = build_map.build_rf_models(trees=NTREES, max_depth=depth, three_folds=False)
 
         print "build model data..."
         table = build_map.build_model_data(models)
 
-        #    print "build matrix..."
-        #    smx = build_map.build_model_matrix(models)
-        #
-        #    random.shuffle(rf_classifier.classifiers)
-        #
-        #    graph = network.Graph()
-        #    graph.add_nodes_from(range(smx.dim))
-        #    graph.set_items(table)
-        #
-        #    # connect each node with the closest neighbor
-        #    edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, 1)
-        #    graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))
-        #
-        #    nodes_left = float(graph.number_of_nodes())
-        #    components = network.nx.connected_components(graph)
-        #    probs = [len(c) / nodes_left for c in components]
-        #
-        #    tree_indices = []
-        #    while nodes_left > 0:
-        #        for p, c in zip(probs, components):
-        #            if random.random() < p:
-        #                tree_indices.append(random.choice(c))
-        #                c.remove(tree_indices[-1])
-        #                nodes_left -= 0
-        #
-        #                if nodes_left <= 0:
-        #                    break
-        #
-        #                for i, c in enumerate(components):
-        #                    probs[i] = len(c) / nodes_left
-
+        #print "build matrix..."
+        #smx = build_map.build_model_matrix(models)
 
         class ModelDistanceConstructor(distance.DistanceConstructor):
 
             def __call__(self, e1, e2):
                 return mm.distance_manhattan(e1["model"].value, e2["model"].value)
 
-        def data_center(table):
+        def data_center(table, original=table):
+            if len(table) == 0:
+                print "e",
+                sys.stdout.flush()
+                table = original
             onemodel = table[0]["model"].value
             model = mm.Model("RF", None,
                 np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
         clustering.kmeans.data_center = data_center
 
         table.shuffle()
-        #best_projs = []
         cluster_indices = []
 
         print "kmeans..."
-        for c in range(2, 120):
-        #for c in range(2, 10):
+        for c in range(2, MAXCLUSTERS + 1):
+            print c,
+            sys.stdout.flush()
             kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
             clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
 
-            #best_projs_tmp = []
             cluster_indices_tmp = []
             for k, g in groupby(clusters_, key=itemgetter(0)):
                 _, indices = zip(*list(g))
-                #c, i = random.choice(g)
-                #best_projs_tmp.append(i)
                 cluster_indices_tmp.append(indices)
 
-            #best_projs.append(best_projs_tmp)
             cluster_indices.append(cluster_indices_tmp)
 
         trees = [ex["model"].value.classifier for ex in table]
 
     print "pickle scores...",
     sys.stdout.flush()
-    pickle.dump(scores, open(os.path.join(ROOT, "_ensemble_", "scores_%d.pkl" % DATASET), "wb"), -1)
+    pickle.dump(scores, open(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET)), "wb"), -1)
     print "done"
 
-    plot_scores(DATASET)
+    #plot_scores(DATASET)
 
 
-DO = sys.argv[1:]
+if __name__ == "__main__":
+    DO = sys.argv[1:]
 
-if len(DO) == 0:
-    DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
-    DO = ["marketing"]
+    if len(DO) == 0:
+        DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
+        DO = ["breast-cancer-wisconsin", "iris"]
 
-for i in range(200):
-    print i
-    for d in DO:
-        plot_trees_score(d)
+    for i in range(500):
+        print i
+        for d in DO:
+            plot_trees_score(d)

File examples/ensemble/parse_mm_forest.py

 import matplotlib
 matplotlib.use('Agg')
 
-import math, os.path, random, re, sys
+import os.path, re
 import cPickle as pickle
 import matplotlib.pyplot as plt
+import scipy.stats
 import numpy as np
 
 from itertools import groupby
 from operator import itemgetter
 
-from Orange import clustering, data, distance, ensemble, evaluation, network, utils
-from Orange.classification.tree import SimpleTreeLearner
-
-import _modelmaps as mm
-
 ROOT = "/home/miha/work/res/modelmaps"
-#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
 #ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
 
+scores = {}
+
+def plot_scores(scoring_method):
+    print "drawing plots..."
+    fig = plt.figure(figsize=(6, 8), dpi=300)
+    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.976, bottom=0.017, left=0.074, right=0.983)
+
+    def add_scores_plot(i, DATASET):
+
+        ax = fig.add_subplot(4, 3, i)
+
+        scores[DATASET][scoring_method].sort()
+        x, y, std, med = [], [], [], []
+        for k, g in groupby(scores[DATASET][scoring_method], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(np.mean(s))
+            std.append(np.std(s))
+            med.append(np.median(s))
+
+        last_x = x[-1]
+        last_y = y[-1]
+
+        y = np.array(y)
+        std = np.array(std)
+
+        ax.plot(x, y, '-', color='k', linewidth=0.5)
+        #ax.plot(x, med, '--', color='k', linewidth=0.5)
+        #ax.plot(x, y + std, '-', color='k', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='k', linewidth=0.1)
+
+        miny, maxy = min(y), max(y)
+
+        scores[DATASET][scoring_method + "_mm"].sort()
+        x, y, std, med = [], [], [], []
+        for k, g in groupby(scores[DATASET][scoring_method + "_mm"], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+            std.append(np.std(s))
+            med.append(np.median(s))
+
+        y.append(last_y)
+        x.append(last_x)
+
+        y = np.array(y)
+        std = np.array(std)
+
+        ax.plot(x, y, '-', color='g', linewidth=0.5)
+        #ax.plot(x, med, '--', color='g', linewidth=0.5)
+        #ax.plot(x, y + std, '-', color='g', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='g', linewidth=0.1)
+
+        miny = min(y) if min(y) < miny else miny
+        maxy = max(y) if max(y) > maxy else maxy
+
+        ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2)])
+        ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2))
+
+        for label in ax.get_xticklabels():
+            label.set_fontsize('xx-small')
+
+        for label in ax.get_yticklabels():
+            label.set_fontsize('xx-small')
+
+        #ax.set_ybound(0., 1.)
+        ax.set_xlabel('trees', size='small')
+        ax.set_ylabel(scoring_method.upper(), size='small')
+        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
+        ax.set_title('%s' % subtitle, weight='bold', size='small')
+
+    counter = 1
+    for DATASET in sorted(scores.keys()):
+        if DATASET == "marketing":
+            continue
+
+        add_scores_plot(counter, DATASET)
+        counter += 1
+
+    ax = fig.add_subplot(4, 3, counter)
+
+    ax.plot([0], [0], 'k-', [0], [0], 'g-')
+    ax.set_axis_off()
+
+    plt.legend(["Random Forest", "Model-map-based forest"], frameon=False)
+    leg = plt.gca().get_legend()
+    ltext = leg.get_texts()
+    plt.setp(ltext, fontsize='xx-small')
+
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s.pdf" % (EXPMARKER, scoring_method)))
+
+def plot_one(DATASET, scoring_method, plot_model_map=True):
+    print "drawing plots..."
+    fig = plt.figure(figsize=(3, 2), dpi=300)
+    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.9, bottom=0.15, left=0.15, right=0.96)
+
+    ax = fig.add_subplot(111)
+
+    scores[DATASET][scoring_method].sort()
+    x, y, std, med = [], [], [], []
+    for k, g in groupby(scores[DATASET][scoring_method], key=itemgetter(0)):
+        i, s = zip(*list(g))
+        x.append(i[0])
+        y.append(np.mean(s))
+        std.append(np.std(s))
+        med.append(np.median(s))
+
+    y = np.array(y)
+    std = np.array(std)
+
+    ax.plot(x, y, '-', color='k', linewidth=0.5, label='Random Forest')
+    if plot_model_map:
+        #ax.plot(x, y + std, '-', color='k', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='k', linewidth=0.1)
+        pass
+
+    miny, maxy = min(y), max(y)
+
+    if plot_model_map:
+        scores[DATASET][scoring_method + "_mm"].sort()
+        x, y, std, med = [], [], [], []
+        for k, g in groupby(scores[DATASET][scoring_method + "_mm"], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+            std.append(np.std(s))
+            med.append(np.median(s))
+
+        y = np.array(y)
+        std = np.array(std)
+
+        ax.plot(x, y, '-', color='g', linewidth=0.5, label='Model-map-based forest')
+        #ax.plot(x, y + std, '-', color='g', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='g', linewidth=0.1)
+
+        miny = min(y) if min(y) < miny else miny
+        maxy = max(y) if max(y) > maxy else maxy
+
+    ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2)-0.15])
+    ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2)-0.15)
+
+    for label in ax.get_xticklabels():
+        label.set_fontsize('xx-small')
+
+    for label in ax.get_yticklabels():
+        label.set_fontsize('xx-small')
+
+    if plot_model_map:
+        leg = plt.legend(frameon=False)
+        leg = plt.gca().get_legend()
+        ltext = leg.get_texts()
+        plt.setp(ltext, fontsize='x-small')
+
+    ax.set_xlabel('trees', size='small')
+    ax.set_ylabel(scoring_method.upper(), size='small')
+    subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
+    ax.set_title('%s' % subtitle, weight='bold', size='small')
+
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s_%s.pdf" % (EXPMARKER, DATASET, scoring_method)))
+
+EXPMARKER = "120_120_3fold_tree_base"
+DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
+
+for DATASET in DO:
+    fname =os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET))
+    print fname
+    if os.path.exists(fname) and os.path.isfile(fname):
+        scores.update(pickle.load(open(fname, "rb")))
+
+#plot_scores("ca")
+#plot_scores("auc")
+plot_scores("brier")
+
+plot_one("marketing", "brier", plot_model_map=True)
+#plot_one("zoo", "brier", plot_model_map=False)
+
+
+def best_scores(scores):
+    x = []
+    scores.sort()
+    for k, g in groupby(scores, key=itemgetter(0)):
+        i, s = zip(*list(g))
+        x.append((i[0], np.mean(s)))
+
+    best = min(x, key=itemgetter(1))[0]
+    print "best", best
+    return [s for t, s in scores if t == best]
+
+def average_scores(scores):
+    x = []
+    scores.sort()
+    for k, g in groupby(scores, key=itemgetter(0)):
+        i, s = zip(*list(g))
+        x.append(np.mean(s))
+
+    return np.array(x)
+
+x = average_scores(scores["marketing"]["brier"])
+y = average_scores(scores["marketing"]["brier_mm"])
+
+trim = min(len(x), len(y))
+print np.sum(x[:trim] - y[:trim])
+
+ranks = scipy.stats.mstats.rankdata(np.array([x[:trim], y[:trim]]), axis=0)
+print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
+print scipy.stats.wilcoxon(x[:trim], y[:trim])
+
+x = best_scores(scores["marketing"]["brier"])
+y = best_scores(scores["marketing"]["brier_mm"])
+
+print "Marketing"
+print "best RF:", max(x)
+print "best MM:", max(y)
+print "Student's t-test:", scipy.stats.ttest_ind(x, y)
+
+for scoring_method in ["ca", "auc", "brier"]:
+    print scoring_method.upper()
+    x, y = [], []
+    for DATASET in scores:
+
+        trim = min(len(list(average_scores(scores[DATASET][scoring_method]))), len(list(average_scores(scores[DATASET][scoring_method+ "_mm"]))))
+        x.extend(list(average_scores(scores[DATASET][scoring_method]))[:trim])
+        y.extend(list(average_scores(scores[DATASET][scoring_method + "_mm"]))[:trim])
+
+    print "Ranks"
+    ranks = scipy.stats.mstats.rankdata(np.array([x, y]), axis=0)
+    print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
+    print scipy.stats.wilcoxon(x, y)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

File examples/projections/parse_entropy.py

 
     ax.set_xbound(1, len(x))
     y_all = np.array(y_all)
-    ax.set_ybound(np.min(y_all), np.max(y_all) + 0.003)
+    #ax.set_ybound(np.min(y_all), np.max(y_all) + 0.003)
+    ax.set_ybound(0, np.max(y_all) + 0.003)
 
     subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", dataset)])
     ax.set_title(r"%s" % (subtitle.replace("Sample", "")), weight='bold', size='large', position=(0.5, 1.1),
 
 for method in methods:
     #print_results(method)
-    #draw_graphs(method, "entropy", r"$H(P_1 \ldots P_i)$", "entropy_joint")
-    #draw_graphs(method, "uncertainty", r"$H(Y \mid P_1 \ldots P_i)$", "entropy_remaining")
+    draw_graphs(method, "entropy", r"$H(P_1 \ldots P_i)$", "entropy_joint")
+    draw_graphs(method, "uncertainty", r"$H(Y \mid P_1 \ldots P_i)$", "entropy_remaining")
     print
 
-for top in [5, 10, 15]:
-    vals = {}
-    for name, abbr in algorithms:
-        vals[abbr] = []
-
-        for method in methods:
-            for dataset in res.iterkeys():
-                if dataset.lower() in ["car", "wine", "lenses", "zoo"]:
-                    continue
-
-                vals[abbr].append(res[dataset][method]["%s_uncertainty" % abbr][top - 1])
-
-    print "Considering top %d projections: Friedman Chi Square Test" % top
-    print "Chi Square: %.3f; p-value: %e" % scipy.stats.friedmanchisquare(*[np.array(val) for val in vals.itervalues()])
-    print
-    print "Ranks"
-    ranks = scipy.stats.mstats.rankdata(np.array([vals[abbr] for name, abbr in algorithms]), axis=0)
-    print "\n".join("%s: %.3f" % (name, r) for r, (name, abbr) in zip(np.mean(ranks, axis=1), algorithms))
-    print
-    print "Critical distance (Nemenyi)"
-    CD = 2.850 * np.sqrt(6. * 7 / 6 / 30)
-    print CD
-
-    cd = evaluation.scoring.compute_CD(list(np.mean(ranks, axis=1)), 30)
-    print "Orange CD:", cd
-    print
-    print
-
-
-    evaluation.scoring.graph_ranks(os.path.join(ROOT, "_projections_", "dermatology-cd-%d.png" % top), np.mean(ranks, axis=1), zip(*algorithms)[0], cd=cd, width=7, textspace=2.2)
+#for top in [5, 10, 15]:
+#    vals = {}
+#    for name, abbr in algorithms:
+#        vals[abbr] = []
+#
+#        for method in methods:
+#            for dataset in res.iterkeys():
+#                if dataset.lower() in ["car", "wine", "lenses", "zoo"]:
+#                    continue
+#
+#                vals[abbr].append(res[dataset][method]["%s_uncertainty" % abbr][top - 1])
+#
+#    print "Considering top %d projections: Friedman Chi Square Test" % top
+#    print "Chi Square: %.3f; p-value: %e" % scipy.stats.friedmanchisquare(*[np.array(val) for val in vals.itervalues()])
+#    print
+#    print "Ranks"
+#    ranks = scipy.stats.mstats.rankdata(np.array([vals[abbr] for name, abbr in algorithms]), axis=0)
+#    print "\n".join("%s: %.3f" % (name, r) for r, (name, abbr) in zip(np.mean(ranks, axis=1), algorithms))
+#    print
+#    print "Critical distance (Nemenyi)"
+#    CD = 2.850 * np.sqrt(6. * 7 / 6 / 30)
+#    print CD
+#
+#    cd = evaluation.scoring.compute_CD(list(np.mean(ranks, axis=1)), 30)
+#    print "Orange CD:", cd
+#    print
+#    print
+#
+#
+#    evaluation.scoring.graph_ranks(os.path.join(ROOT, "_projections_", "dermatology-cd-%d.png" % top), np.mean(ranks, axis=1), zip(*algorithms)[0], cd=cd, width=7, textspace=2.2)

File rtd_requirements.txt

+numpydoc==0.4
+Orange
+setuptools==0.6c11
+numpy==1.6.2
+PIL==1.1.6
+networkx==1.7