Commits

Miha Stajdohar  committed 2d48c86

Build model-map-based forest and compare it with RF.

  • Participants
  • Parent commits 407dae6

Comments (0)

Files changed (1)

File examples/ensemble/mm_forest.py

+import matplotlib
+matplotlib.use('Agg')
+
+import math, os.path, random, re, sys
+import cPickle as pickle
+import matplotlib.pyplot as plt
+import numpy as np
+
+from itertools import groupby
+from operator import itemgetter
+
+from Orange import clustering, data, distance, ensemble, evaluation, network, utils
+from Orange.classification.tree import SimpleTreeLearner
+
+import _modelmaps as mm
+
+ROOT = "/home/miha/work/res/modelmaps"
+#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
+scores = {}
+cache = {}
+
+def plot_scores(DATASET):
+    print "drawing plots..."
+    fig = plt.figure(figsize=(6, 8), dpi=300)
+    fig.subplots_adjust(wspace=0.0, hspace=0.6, top=0.95, bottom=0.06, left=0.1, right=0.97)
+
+    def add_scores_plot(i, type):
+
+        ax = fig.add_subplot(3, 1, i)
+
+        scores[DATASET][type].sort()
+        x, y = [], []
+        for k, g in groupby(scores[DATASET][type], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+
+        ax.plot(x, y, '-', color='k', linewidth=1)
+
+        scores[DATASET][type + "_mm"].sort()
+        x, y = [], []
+        for k, g in groupby(scores[DATASET][type + "_mm"], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+
+        ax.plot(x, y, '--', color='g', linewidth=0.5)
+
+        for label in ax.get_xticklabels():
+            label.set_fontsize('small')
+
+        for label in ax.get_yticklabels():
+            label.set_fontsize('small')
+
+        ax.set_xlabel('trees')
+        ax.set_ylabel(type.upper())
+        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
+        ax.set_title('%s' % subtitle, weight='bold')
+
+    add_scores_plot(0, "ca")
+    add_scores_plot(1, "auc")
+    add_scores_plot(2, "brier")
+
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_m3.png" % (DATASET, 2, len(scores[DATASET]["ca"]))))
+
+
def score_trees(rf_classifier, trees, test, DATASET, cluster_indices=None):
    """Evaluate ensembles of increasing size on ``test`` and record scores.

    :param rf_classifier: Orange RF classifier whose ``classifiers`` list is
        replaced with each candidate tree subset before evaluation.
    :param trees: pool of individual tree classifiers.
    :param test: Orange data table used for evaluation.
    :param DATASET: key under which results are stored in global ``scores``.
    :param cluster_indices: if None, score random subsets of 2..119 trees
        (plain-RF baseline); otherwise a list of clusterings, each a list of
        clusters of indices into ``trees`` — one tree is drawn per cluster
        and results are stored under the "*_mm" keys.
    """
    score = scores.get(DATASET, {"ca": [], "auc": [], "brier": [], "ca_mm": [], "auc_mm": [], "brier_mm": []})

    if cluster_indices is None:
        # Baseline: random subsets of growing size.
        models = [random.sample(trees, i) for i in range(2, 120)]
    else:
        # Model-map forest: one randomly chosen tree per cluster.
        models = [[trees[random.choice(cluster)] for cluster in clustering] for clustering in cluster_indices]

    # Use the same `is None` test as above so an empty cluster_indices list
    # cannot record model-map results under the baseline keys (the old code
    # tested truthiness here, inconsistently with the branch above).
    suffix = "" if cluster_indices is None else "_mm"

    # Note: do not reuse the name `trees` for the loop variable — the old
    # code shadowed the parameter.
    for subset in models:
        rf_classifier.classifiers = subset
        classifiers = [rf_classifier]

        # Build an ExperimentResults shell and fill it by running the
        # classifiers over the test data manually.
        test_results = evaluation.testing.ExperimentResults(1,
            classifier_names=["RF"],
            domain=test.domain,
            test_type=evaluation.testing.TEST_TYPE_SINGLE,
            weights=0)

        test_results.results = [test_results.create_tested_example(0, example) for example in test]

        results = evaluation.testing._default_evaluation._test_on_data(classifiers, test)

        for example, classifier, result in results:
            test_results.results[example].set_result(classifier, *result)

        score["ca" + suffix].append((len(subset), evaluation.scoring.CA(test_results)[0]))
        score["auc" + suffix].append((len(subset), evaluation.scoring.AUC(test_results)[0]))
        score["brier" + suffix].append((len(subset), evaluation.scoring.Brier_score(test_results)[0]))

    scores[DATASET] = score
+
+def plot_trees_score(DATASET, depth=None, seed=0):
+
+    print "DATASET:", DATASET
+
+    if not DATASET in cache:
+        fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
+
+            if not (os.path.exists(fname) and os.path.isfile(fname)):
+                raise IOError("File %s not found." % fname)
+
+        build_map = mm.BuildModelMap(fname)
+
+        ntrees = 500
+
+        print "build models..."
+        models, rf_classifier, test = build_map.build_rf_models(trees=ntrees, max_depth=depth)
+
+        print "build model data..."
+        table = build_map.build_model_data(models)
+
+        #    print "build matrix..."
+        #    smx = build_map.build_model_matrix(models)
+        #
+        #    random.shuffle(rf_classifier.classifiers)
+        #
+        #    graph = network.Graph()
+        #    graph.add_nodes_from(range(smx.dim))
+        #    graph.set_items(table)
+        #
+        #    # connect each node with the closest neighbor
+        #    edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, 1)
+        #    graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))
+        #
+        #    nodes_left = float(graph.number_of_nodes())
+        #    components = network.nx.connected_components(graph)
+        #    probs = [len(c) / nodes_left for c in components]
+        #
+        #    tree_indices = []
+        #    while nodes_left > 0:
+        #        for p, c in zip(probs, components):
+        #            if random.random() < p:
+        #                tree_indices.append(random.choice(c))
+        #                c.remove(tree_indices[-1])
+        #                nodes_left -= 0
+        #
+        #                if nodes_left <= 0:
+        #                    break
+        #
+        #                for i, c in enumerate(components):
+        #                    probs[i] = len(c) / nodes_left
+
+
+        class ModelDistanceConstructor(distance.DistanceConstructor):
+
+            def __new__(cls, data=None):
+                self = distance.DistanceConstructor.__new__(cls)
+                return self.__call__(data) if data else self
+
+            def __call__(self, table):
+                return ModelDistance()
+
+        class ModelDistance(distance.Distance):
+            def __call__(self, e1, e2):
+                return mm.distance_manhattan(e1["model"].value, e2["model"].value)
+
+        def data_center(table):
+            onemodel = table[0]["model"].value
+            model = mm.Model("RF", None,
+                np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
+                onemodel.class_values, [],
+                [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
+                [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
+
+            return model.get_instance(table.domain)
+
+        table.random_generator = seed
+        clustering.kmeans.data_center = data_center
+
+        table.shuffle()
+        #best_projs = []
+        cluster_indices = []
+
+        print "kmeans..."
+        for c in range(2, 120):
+        #for c in range(2, 10):
+            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+            #best_projs_tmp = []
+            cluster_indices_tmp = []
+            for k, g in groupby(clusters_, key=itemgetter(0)):
+                _, indices = zip(*list(g))
+                #c, i = random.choice(g)
+                #best_projs_tmp.append(i)
+                cluster_indices_tmp.append(indices)
+
+            #best_projs.append(best_projs_tmp)
+            cluster_indices.append(cluster_indices_tmp)
+
+        trees = [ex["model"].value.classifier for ex in table]
+
+        cache[DATASET] = {}
+        cache[DATASET]["rf"] = rf_classifier
+        cache[DATASET]["trees"] = trees
+        cache[DATASET]["test"] = test
+        cache[DATASET]["cluster_indices"] = cluster_indices
+    else:
+        rf_classifier = cache[DATASET]["rf"]
+        trees = cache[DATASET]["trees"]
+        test = cache[DATASET]["test"]
+        cluster_indices = cache[DATASET]["cluster_indices"]
+
+    print "score trees in RF..."
+    score_trees(rf_classifier, trees, test, DATASET)
+
+    print "score trees in MM..."
+    score_trees(rf_classifier, trees, test, DATASET, cluster_indices)
+
+    print "pickle scores...",
+    sys.stdout.flush()
+    pickle.dump(scores, open(os.path.join(ROOT, "_ensemble_", "scores_%d.pkl" % DATASET), "wb"), -1)
+    print "done"
+
+    plot_scores(DATASET)
+
+
+DO = sys.argv[1:]
+
+if len(DO) == 0:
+    DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
+    DO = ["marketing"]
+
+for i in range(200):
+    print i
+    for d in DO:
+        plot_trees_score(d)