Commits

Miha Stajdohar committed 7984932

Renamed.

Comments (0)

Files changed (9)

examples/ensemble/build_rf.py

-__author__ = '"Miha Stajdohar" <miha.stajdohar@gmail.com>'
-
-import matplotlib
-matplotlib.use('Agg')
-
-import os.path, sys
-import numpy as np
-import orangecontrib.modelmaps as mm
-#import cPickle as pickle
-
-from Orange import data, utils
-
-ROOT = "/home/miha/work/res/modelmaps"
-ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
-
-def build_rd_map(DATASET):
-    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
-
-    if not (os.path.exists(fname) and os.path.isfile(fname)):
-        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
-
-        if not (os.path.exists(fname) and os.path.isfile(fname)):
-            raise IOError("File %s not found." % fname)
-
-    build_map = mm.BuildModelMap(fname)
-
-    trees = 150
-
-    print "build models..."
-    models, models_1, rf_classifier, _ = build_map.build_rf_models(trees=trees, max_depth=None, three_folds=False)
-
-    print "build model data..."
-    table = build_map.build_model_data(models)
-    table_1 = build_map.build_model_data(models_1)
-
-    print "build matrix..."
-    smx = build_map.build_model_matrix(models)
-    smx_1 = build_map.build_model_matrix(models_1)
-    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models), sys.platform)), smx, table, build_map.data())
-    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models_1), sys.platform)), smx_1, table_1, build_map.data())
-
-#build_rd_map("zoo")
-#build_rd_map("marketing")
-#build_rd_map("vehicle")
-#build_rd_map("iris")
-#build_rd_map("voting")
-
-DO = ["iris", "breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "marketing", "primary-tumor", "vehicle", "wdbc", "dermatology"]
-DO = ["marketing"]
-
-for d in DO:
-    build_rd_map(d)

examples/ensemble/mm_forest.py

-import matplotlib
-matplotlib.use('Agg')
-
-import os.path, random, re, sys
-import cPickle as pickle
-import matplotlib.pyplot as plt
-import numpy as np
-
-from itertools import groupby
-from operator import itemgetter
-
-from Orange import clustering, distance, evaluation, utils
-
-import orangecontrib.modelmaps as mm
-
-ROOT = "/home/miha/work/res/modelmaps"
-ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
-
-scores = {}
-cache = {}
-
-NTREES = 1000
-MAXCLUSTERS = 120
-EXPMARKER = "1000_120_2fold_tree_base"
-
-def plot_scores(DATASET):
-    print "drawing plots..."
-    fig = plt.figure(figsize=(6, 8), dpi=300)
-    fig.subplots_adjust(wspace=0.0, hspace=0.6, top=0.95, bottom=0.06, left=0.1, right=0.97)
-
-    def add_scores_plot(i, type):
-
-        ax = fig.add_subplot(3, 1, i)
-
-        scores[DATASET][type].sort()
-        x, y = [], []
-        for k, g in groupby(scores[DATASET][type], key=itemgetter(0)):
-            i, s = zip(*list(g))
-            x.append(i[0])
-            y.append(sum(s) / len(s))
-
-        ax.plot(x, y, '-', color='k', linewidth=1)
-
-        scores[DATASET][type + "_mm"].sort()
-        x, y = [], []
-        for k, g in groupby(scores[DATASET][type + "_mm"], key=itemgetter(0)):
-            i, s = zip(*list(g))
-            x.append(i[0])
-            y.append(sum(s) / len(s))
-
-        ax.plot(x, y, '--', color='g', linewidth=0.5)
-
-        for label in ax.get_xticklabels():
-            label.set_fontsize('small')
-
-        for label in ax.get_yticklabels():
-            label.set_fontsize('small')
-
-        ax.set_ybound(0., 1.)
-
-        ax.set_xlabel('trees')
-        ax.set_ylabel(type.upper())
-        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
-        ax.set_title('%s' % subtitle, weight='bold')
-
-    add_scores_plot(0, "ca")
-    add_scores_plot(1, "auc")
-    add_scores_plot(2, "brier")
-
-    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.png" % (EXPMARKER, DATASET)))
-
-
-def score_trees(rf_classifier, trees, test, DATASET, cluster_indices=None):
-
-    score = scores.get(DATASET, {"ca": [], "auc": [], "brier": [], "ca_mm": [], "auc_mm": [], "brier_mm": []})
-
-    if cluster_indices is None:
-        models = [random.sample(trees, i) for i in range(2, min(MAXCLUSTERS * 2, len(trees)))]
-    else:
-        models = [[trees[random.choice(cluster)] for cluster in clustering] for clustering in cluster_indices]
-
-    for trees in models:
-        rf_classifier.classifiers = trees
-        classifiers = [rf_classifier]
-
-        test_results = evaluation.testing.ExperimentResults(1,
-            classifier_names = ["RF"],
-            domain=test.domain,
-            test_type = evaluation.testing.TEST_TYPE_SINGLE,
-            weights=0)
-
-        test_results.results = [test_results.create_tested_example(0, example) for j, example in enumerate(test)]
-
-        results = evaluation.testing._default_evaluation._test_on_data(classifiers, test)
-
-        for example, classifier, result in results:
-            test_results.results[example].set_result(classifier, *result)
-
-        score["ca_mm" if cluster_indices else "ca"].append((len(trees), evaluation.scoring.CA(test_results)[0]))
-        score["auc_mm" if cluster_indices else "auc"].append((len(trees), evaluation.scoring.AUC(test_results)[0]))
-        score["brier_mm" if cluster_indices else "brier"].append((len(trees), evaluation.scoring.Brier_score(test_results)[0]))
-
-    scores[DATASET] = score
-
-def plot_trees_score(DATASET, depth=None, seed=0):
-
-    print "DATASET:", DATASET
-
-    if not DATASET in cache:
-        fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
-
-        if not (os.path.exists(fname) and os.path.isfile(fname)):
-            fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
-
-            if not (os.path.exists(fname) and os.path.isfile(fname)):
-                raise IOError("File %s not found." % fname)
-
-        build_map = mm.BuildModelMap(fname)
-
-        print "build models..."
-        models, rf_classifier, test = build_map.build_rf_models(trees=NTREES, max_depth=depth, three_folds=False)
-
-        print "build model data..."
-        table = build_map.build_model_data(models)
-
-        #print "build matrix..."
-        #smx = build_map.build_model_matrix(models)
-
-        class ModelDistanceConstructor(distance.DistanceConstructor):
-
-            def __new__(cls, data=None):
-                self = distance.DistanceConstructor.__new__(cls)
-                return self.__call__(data) if data else self
-
-            def __call__(self, table):
-                return ModelDistance()
-
-        class ModelDistance(distance.Distance):
-            def __call__(self, e1, e2):
-                return mm.distance_manhattan(e1["model"].value, e2["model"].value)
-
-        def data_center(table, original=table):
-            if len(table) == 0:
-                print "e",
-                sys.stdout.flush()
-                table = original
-            onemodel = table[0]["model"].value
-            model = mm.Model("RF", None,
-                np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
-                onemodel.class_values, [],
-                [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
-                [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
-
-            return model.get_instance(table.domain)
-
-        table.random_generator = seed
-        clustering.kmeans.data_center = data_center
-
-        table.shuffle()
-        cluster_indices = []
-
-        print "kmeans..."
-        for c in range(2, MAXCLUSTERS + 1):
-            print c,
-            sys.stdout.flush()
-            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
-            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
-
-            cluster_indices_tmp = []
-            for k, g in groupby(clusters_, key=itemgetter(0)):
-                _, indices = zip(*list(g))
-                cluster_indices_tmp.append(indices)
-
-            cluster_indices.append(cluster_indices_tmp)
-
-        trees = [ex["model"].value.classifier for ex in table]
-
-        cache[DATASET] = {}
-        cache[DATASET]["rf"] = rf_classifier
-        cache[DATASET]["trees"] = trees
-        cache[DATASET]["test"] = test
-        cache[DATASET]["cluster_indices"] = cluster_indices
-    else:
-        rf_classifier = cache[DATASET]["rf"]
-        trees = cache[DATASET]["trees"]
-        test = cache[DATASET]["test"]
-        cluster_indices = cache[DATASET]["cluster_indices"]
-
-    print "score trees in RF..."
-    score_trees(rf_classifier, trees, test, DATASET)
-
-    print "score trees in MM..."
-    score_trees(rf_classifier, trees, test, DATASET, cluster_indices)
-
-    print "pickle scores...",
-    sys.stdout.flush()
-    pickle.dump(scores, open(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET)), "wb"), -1)
-    print "done"
-
-    #plot_scores(DATASET)
-
-
-if __name__ == "__main__":
-    DO = sys.argv[1:]
-
-    if len(DO) == 0:
-        DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
-        DO = ["breast-cancer-wisconsin", "iris"]
-
-    for i in range(500):
-        print i
-        for d in DO:
-            plot_trees_score(d)

examples/ensemble/parse_mm_forest.py

-import matplotlib
-matplotlib.use('Agg')
-
-import os.path, re
-import cPickle as pickle
-import matplotlib.pyplot as plt
-import scipy.stats
-import numpy as np
-
-from itertools import groupby
-from operator import itemgetter
-
-ROOT = "/Users/miha/work/res/modelmaps"
-#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
-
-scores = {}
-
-def plot_scores(scoring_method):
-    print "drawing plots..."
-    fig = plt.figure(figsize=(6, 8), dpi=300)
-    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.976, bottom=0.017, left=0.074, right=0.983)
-
-    def add_scores_plot(i, DATASET):
-
-        ax = fig.add_subplot(4, 3, i)
-
-        scores[DATASET][scoring_method].sort()
-        x, y, std, med = [], [], [], []
-        for k, g in groupby(scores[DATASET][scoring_method], key=itemgetter(0)):
-            i, s = zip(*list(g))
-            x.append(i[0])
-            y.append(np.mean(s))
-            std.append(np.std(s))
-            med.append(np.median(s))
-
-        last_x = x[-1]
-        last_y = y[-1]
-
-        y = np.array(y)
-        std = np.array(std)
-
-        ax.plot(x, y, '-', color='k', linewidth=0.5)
-        #ax.plot(x, med, '--', color='k', linewidth=0.5)
-        #ax.plot(x, y + std, '-', color='k', linewidth=0.1)
-        #ax.plot(x, y - std, '-', color='k', linewidth=0.1)
-
-        miny, maxy = min(y), max(y)
-
-        scores[DATASET][scoring_method + "_mm"].sort()
-        x, y, std, med = [], [], [], []
-        for k, g in groupby(scores[DATASET][scoring_method + "_mm"], key=itemgetter(0)):
-            i, s = zip(*list(g))
-            x.append(i[0])
-            y.append(sum(s) / len(s))
-            std.append(np.std(s))
-            med.append(np.median(s))
-
-        y.append(last_y)
-        x.append(last_x)
-
-        y = np.array(y)
-        std = np.array(std)
-
-        ax.plot(x, y, '-', color='g', linewidth=0.5)
-        #ax.plot(x, med, '--', color='g', linewidth=0.5)
-        #ax.plot(x, y + std, '-', color='g', linewidth=0.1)
-        #ax.plot(x, y - std, '-', color='g', linewidth=0.1)
-
-        miny = min(y) if min(y) < miny else miny
-        maxy = max(y) if max(y) > maxy else maxy
-
-        ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2)])
-        ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2))
-
-        for label in ax.get_xticklabels():
-            label.set_fontsize('xx-small')
-
-        for label in ax.get_yticklabels():
-            label.set_fontsize('xx-small')
-
-        #ax.set_ybound(0., 1.)
-        ax.set_xlabel('trees', size='small')
-        ax.set_ylabel(scoring_method.upper(), size='small')
-        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
-        ax.set_title('%s' % subtitle, weight='bold', size='small')
-
-    counter = 1
-    for DATASET in sorted(scores.keys()):
-        if DATASET == "marketing":
-            continue
-
-        add_scores_plot(counter, DATASET)
-        counter += 1
-
-    ax = fig.add_subplot(4, 3, counter)
-
-    ax.plot([0], [0], 'k-', [0], [0], 'g-')
-    ax.set_axis_off()
-
-    plt.legend(["Random Forest", "Model-map-based forest"], frameon=False)
-    leg = plt.gca().get_legend()
-    ltext = leg.get_texts()
-    plt.setp(ltext, fontsize='xx-small')
-
-    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s.pdf" % (EXPMARKER, scoring_method)))
-
-def plot_one(DATASET, scoring_method, plot_model_map=True):
-    print "drawing plots..."
-    fig = plt.figure(figsize=(3, 2), dpi=300)
-    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.9, bottom=0.15, left=0.15, right=0.96)
-
-    ax = fig.add_subplot(111)
-
-    scores[DATASET][scoring_method].sort()
-    x, y, std, med = [], [], [], []
-    for k, g in groupby(scores[DATASET][scoring_method], key=itemgetter(0)):
-        i, s = zip(*list(g))
-        x.append(i[0])
-        y.append(np.mean(s))
-        std.append(np.std(s))
-        med.append(np.median(s))
-
-    y = np.array(y)
-    std = np.array(std)
-
-    ax.plot(x, y, '-', color='k', linewidth=0.5, label='Random Forest')
-    if plot_model_map:
-        #ax.plot(x, y + std, '-', color='k', linewidth=0.1)
-        #ax.plot(x, y - std, '-', color='k', linewidth=0.1)
-        pass
-
-    miny, maxy = min(y), max(y)
-
-    if plot_model_map:
-        scores[DATASET][scoring_method + "_mm"].sort()
-        x, y, std, med = [], [], [], []
-        for k, g in groupby(scores[DATASET][scoring_method + "_mm"], key=itemgetter(0)):
-            i, s = zip(*list(g))
-            x.append(i[0])
-            y.append(sum(s) / len(s))
-            std.append(np.std(s))
-            med.append(np.median(s))
-
-        y = np.array(y)
-        std = np.array(std)
-
-        ax.plot(x, y, '-', color='g', linewidth=0.5, label='Model-map-based forest')
-        #ax.plot(x, y + std, '-', color='g', linewidth=0.1)
-        #ax.plot(x, y - std, '-', color='g', linewidth=0.1)
-
-        miny = min(y) if min(y) < miny else miny
-        maxy = max(y) if max(y) > maxy else maxy
-
-    ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2)-0.15])
-    ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2)-0.15)
-
-    for label in ax.get_xticklabels():
-        label.set_fontsize('xx-small')
-
-    for label in ax.get_yticklabels():
-        label.set_fontsize('xx-small')
-
-    if plot_model_map:
-        leg = plt.legend(frameon=False)
-        leg = plt.gca().get_legend()
-        ltext = leg.get_texts()
-        plt.setp(ltext, fontsize='x-small')
-
-    ax.set_xlabel('trees', size='small')
-    ax.set_ylabel(scoring_method.upper(), size='small')
-    subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
-    ax.set_title('%s' % subtitle, weight='bold', size='small')
-
-    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s_%s.pdf" % (EXPMARKER, DATASET, scoring_method)))
-
-EXPMARKER = "120_120_3fold_tree_base"
-DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
-
-for DATASET in DO:
-    fname =os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET))
-    print fname
-    if os.path.exists(fname) and os.path.isfile(fname):
-        scores.update(pickle.load(open(fname, "rb")))
-
-#plot_scores("ca")
-#plot_scores("auc")
-plot_scores("brier")
-
-plot_one("marketing", "brier", plot_model_map=True)
-#plot_one("zoo", "brier", plot_model_map=False)
-
-
-def best_scores(scores):
-    x = []
-    scores.sort()
-    for k, g in groupby(scores, key=itemgetter(0)):
-        i, s = zip(*list(g))
-        x.append((i[0], np.mean(s)))
-
-    best = min(x, key=itemgetter(1))[0]
-    print "best", best
-    return [s for t, s in scores if t == best]
-
-def average_scores(scores):
-    x = []
-    scores.sort()
-    for k, g in groupby(scores, key=itemgetter(0)):
-        i, s = zip(*list(g))
-        x.append(np.mean(s))
-
-    return np.array(x)
-
-x = average_scores(scores["marketing"]["brier"])
-y = average_scores(scores["marketing"]["brier_mm"])
-
-trim = min(len(x), len(y))
-print np.sum(x[:trim] - y[:trim])
-
-ranks = scipy.stats.mstats.rankdata(np.array([x[:trim], y[:trim]]), axis=0)
-print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
-print scipy.stats.wilcoxon(x[:trim], y[:trim])
-
-x = best_scores(scores["marketing"]["brier"])
-y = best_scores(scores["marketing"]["brier_mm"])
-
-print "Marketing"
-print "best RF:", max(x)
-print "best MM:", max(y)
-print "Student's t-test:", scipy.stats.ttest_ind(x, y)
-
-for scoring_method in ["ca", "auc", "brier"]:
-    print scoring_method.upper()
-    x, y = [], []
-    for DATASET in scores:
-
-        trim = min(len(list(average_scores(scores[DATASET][scoring_method]))), len(list(average_scores(scores[DATASET][scoring_method+ "_mm"]))))
-        x.extend(list(average_scores(scores[DATASET][scoring_method]))[:trim])
-        y.extend(list(average_scores(scores[DATASET][scoring_method + "_mm"]))[:trim])
-
-    print "Ranks"
-    ranks = scipy.stats.mstats.rankdata(np.array([x, y]), axis=0)
-    print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
-    print scipy.stats.wilcoxon(x, y)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

examples/ensemble/play_with_rf.py

-import matplotlib
-matplotlib.use('Agg')
-
-import math, os.path, random, re, sys
-import cPickle as pickle
-import matplotlib.pyplot as plt
-import numpy as np
-
-from itertools import groupby
-from operator import itemgetter
-
-from Orange import clustering, data, distance, ensemble, evaluation, network, utils
-from Orange.classification.tree import SimpleTreeLearner
-
-import orangecontrib.modelmaps as mm
-
-ROOT = "/home/miha/work/res/modelmaps"
-#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
-

examples/ensemble/rf_performance_build.py

+__author__ = '"Miha Stajdohar" <miha.stajdohar@gmail.com>'
+
+import matplotlib
+matplotlib.use('Agg')
+
+import os.path, sys
+import numpy as np
+import orangecontrib.modelmaps as mm
+#import cPickle as pickle
+
+from Orange import data, utils
+
+ROOT = "/home/miha/work/res/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
+def build_rd_map(DATASET):
+    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+
+    if not (os.path.exists(fname) and os.path.isfile(fname)):
+        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
+
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            raise IOError("File %s not found." % fname)
+
+    build_map = mm.BuildModelMap(fname)
+
+    trees = 150
+
+    print "build models..."
+    models, models_1, rf_classifier, _ = build_map.build_rf_models(trees=trees, max_depth=None, three_folds=False)
+
+    print "build model data..."
+    table = build_map.build_model_data(models)
+    table_1 = build_map.build_model_data(models_1)
+
+    print "build matrix..."
+    smx = build_map.build_model_matrix(models)
+    smx_1 = build_map.build_model_matrix(models_1)
+    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models), sys.platform)), smx, table, build_map.data())
+    mm.save(os.path.join(ROOT, "_ensemble_", "rf_%s_%d_tree_base_%s" % (DATASET, len(models_1), sys.platform)), smx_1, table_1, build_map.data())
+
+#build_rd_map("zoo")
+#build_rd_map("marketing")
+#build_rd_map("vehicle")
+#build_rd_map("iris")
+#build_rd_map("voting")
+
+DO = ["iris", "breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "marketing", "primary-tumor", "vehicle", "wdbc", "dermatology"]
+DO = ["marketing"]
+
+for d in DO:
+    build_rd_map(d)

examples/ensemble/rf_performance_plot_1.py

+import matplotlib
+matplotlib.use('Agg')
+
+import os.path, random, re, sys
+import cPickle as pickle
+import matplotlib.pyplot as plt
+import numpy as np
+
+from itertools import groupby
+from operator import itemgetter
+
+from Orange import clustering, distance, evaluation, utils
+
+import orangecontrib.modelmaps as mm
+
+ROOT = "/home/miha/work/res/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
+scores = {}
+cache = {}
+
+NTREES = 1000
+MAXCLUSTERS = 120
+EXPMARKER = "1000_120_2fold_tree_base"
+
+def plot_scores(DATASET):
+    print "drawing plots..."
+    fig = plt.figure(figsize=(6, 8), dpi=300)
+    fig.subplots_adjust(wspace=0.0, hspace=0.6, top=0.95, bottom=0.06, left=0.1, right=0.97)
+
+    def add_scores_plot(i, type):
+
+        ax = fig.add_subplot(3, 1, i)
+
+        scores[DATASET][type].sort()
+        x, y = [], []
+        for k, g in groupby(scores[DATASET][type], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+
+        ax.plot(x, y, '-', color='k', linewidth=1)
+
+        scores[DATASET][type + "_mm"].sort()
+        x, y = [], []
+        for k, g in groupby(scores[DATASET][type + "_mm"], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+
+        ax.plot(x, y, '--', color='g', linewidth=0.5)
+
+        for label in ax.get_xticklabels():
+            label.set_fontsize('small')
+
+        for label in ax.get_yticklabels():
+            label.set_fontsize('small')
+
+        ax.set_ybound(0., 1.)
+
+        ax.set_xlabel('trees')
+        ax.set_ylabel(type.upper())
+        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
+        ax.set_title('%s' % subtitle, weight='bold')
+
+    add_scores_plot(0, "ca")
+    add_scores_plot(1, "auc")
+    add_scores_plot(2, "brier")
+
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.png" % (EXPMARKER, DATASET)))
+
+
+def score_trees(rf_classifier, trees, test, DATASET, cluster_indices=None):
+
+    score = scores.get(DATASET, {"ca": [], "auc": [], "brier": [], "ca_mm": [], "auc_mm": [], "brier_mm": []})
+
+    if cluster_indices is None:
+        models = [random.sample(trees, i) for i in range(2, min(MAXCLUSTERS * 2, len(trees)))]
+    else:
+        models = [[trees[random.choice(cluster)] for cluster in clustering] for clustering in cluster_indices]
+
+    for trees in models:
+        rf_classifier.classifiers = trees
+        classifiers = [rf_classifier]
+
+        test_results = evaluation.testing.ExperimentResults(1,
+            classifier_names = ["RF"],
+            domain=test.domain,
+            test_type = evaluation.testing.TEST_TYPE_SINGLE,
+            weights=0)
+
+        test_results.results = [test_results.create_tested_example(0, example) for j, example in enumerate(test)]
+
+        results = evaluation.testing._default_evaluation._test_on_data(classifiers, test)
+
+        for example, classifier, result in results:
+            test_results.results[example].set_result(classifier, *result)
+
+        score["ca_mm" if cluster_indices else "ca"].append((len(trees), evaluation.scoring.CA(test_results)[0]))
+        score["auc_mm" if cluster_indices else "auc"].append((len(trees), evaluation.scoring.AUC(test_results)[0]))
+        score["brier_mm" if cluster_indices else "brier"].append((len(trees), evaluation.scoring.Brier_score(test_results)[0]))
+
+    scores[DATASET] = score
+
+def plot_trees_score(DATASET, depth=None, seed=0):
+
+    print "DATASET:", DATASET
+
+    if not DATASET in cache:
+        fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
+
+            if not (os.path.exists(fname) and os.path.isfile(fname)):
+                raise IOError("File %s not found." % fname)
+
+        build_map = mm.BuildModelMap(fname)
+
+        print "build models..."
+        models, rf_classifier, test = build_map.build_rf_models(trees=NTREES, max_depth=depth, three_folds=False)
+
+        print "build model data..."
+        table = build_map.build_model_data(models)
+
+        #print "build matrix..."
+        #smx = build_map.build_model_matrix(models)
+
+        class ModelDistanceConstructor(distance.DistanceConstructor):
+
+            def __new__(cls, data=None):
+                self = distance.DistanceConstructor.__new__(cls)
+                return self.__call__(data) if data else self
+
+            def __call__(self, table):
+                return ModelDistance()
+
+        class ModelDistance(distance.Distance):
+            def __call__(self, e1, e2):
+                return mm.distance_manhattan(e1["model"].value, e2["model"].value)
+
+        def data_center(table, original=table):
+            if len(table) == 0:
+                print "e",
+                sys.stdout.flush()
+                table = original
+            onemodel = table[0]["model"].value
+            model = mm.Model("RF", None,
+                np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
+                onemodel.class_values, [],
+                [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
+                [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
+
+            return model.get_instance(table.domain)
+
+        table.random_generator = seed
+        clustering.kmeans.data_center = data_center
+
+        table.shuffle()
+        cluster_indices = []
+
+        print "kmeans..."
+        for c in range(2, MAXCLUSTERS + 1):
+            print c,
+            sys.stdout.flush()
+            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+            cluster_indices_tmp = []
+            for k, g in groupby(clusters_, key=itemgetter(0)):
+                _, indices = zip(*list(g))
+                cluster_indices_tmp.append(indices)
+
+            cluster_indices.append(cluster_indices_tmp)
+
+        trees = [ex["model"].value.classifier for ex in table]
+
+        cache[DATASET] = {}
+        cache[DATASET]["rf"] = rf_classifier
+        cache[DATASET]["trees"] = trees
+        cache[DATASET]["test"] = test
+        cache[DATASET]["cluster_indices"] = cluster_indices
+    else:
+        rf_classifier = cache[DATASET]["rf"]
+        trees = cache[DATASET]["trees"]
+        test = cache[DATASET]["test"]
+        cluster_indices = cache[DATASET]["cluster_indices"]
+
+    print "score trees in RF..."
+    score_trees(rf_classifier, trees, test, DATASET)
+
+    print "score trees in MM..."
+    score_trees(rf_classifier, trees, test, DATASET, cluster_indices)
+
+    print "pickle scores...",
+    sys.stdout.flush()
+    pickle.dump(scores, open(os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET)), "wb"), -1)
+    print "done"
+
+    #plot_scores(DATASET)
+
+
+if __name__ == "__main__":
+    DO = sys.argv[1:]
+
+    if len(DO) == 0:
+        DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]
+        DO = ["breast-cancer-wisconsin", "iris"]
+
+    for i in range(500):
+        print i
+        for d in DO:
+            plot_trees_score(d)

examples/ensemble/rf_performance_plot_2.py

+import matplotlib
+matplotlib.use('Agg')
+
+import os.path, re
+import cPickle as pickle
+import matplotlib.pyplot as plt
+import scipy.stats
+import numpy as np
+
+from itertools import groupby
+from operator import itemgetter
+
+ROOT = "/Users/miha/work/res/modelmaps"
+#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
+scores = {}
+
+def plot_scores(scoring_method):
+    print "drawing plots..."
+    fig = plt.figure(figsize=(6, 8), dpi=300)
+    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.976, bottom=0.017, left=0.074, right=0.983)
+
+    def add_scores_plot(i, DATASET):
+
+        ax = fig.add_subplot(4, 3, i)
+
+        scores[DATASET][scoring_method].sort()
+        x, y, std, med = [], [], [], []
+        for k, g in groupby(scores[DATASET][scoring_method], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(np.mean(s))
+            std.append(np.std(s))
+            med.append(np.median(s))
+
+        last_x = x[-1]
+        last_y = y[-1]
+
+        y = np.array(y)
+        std = np.array(std)
+
+        ax.plot(x, y, '-', color='k', linewidth=0.5)
+        #ax.plot(x, med, '--', color='k', linewidth=0.5)
+        #ax.plot(x, y + std, '-', color='k', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='k', linewidth=0.1)
+
+        miny, maxy = min(y), max(y)
+
+        scores[DATASET][scoring_method + "_mm"].sort()
+        x, y, std, med = [], [], [], []
+        for k, g in groupby(scores[DATASET][scoring_method + "_mm"], key=itemgetter(0)):
+            i, s = zip(*list(g))
+            x.append(i[0])
+            y.append(sum(s) / len(s))
+            std.append(np.std(s))
+            med.append(np.median(s))
+
+        y.append(last_y)
+        x.append(last_x)
+
+        y = np.array(y)
+        std = np.array(std)
+
+        ax.plot(x, y, '-', color='g', linewidth=0.5)
+        #ax.plot(x, med, '--', color='g', linewidth=0.5)
+        #ax.plot(x, y + std, '-', color='g', linewidth=0.1)
+        #ax.plot(x, y - std, '-', color='g', linewidth=0.1)
+
+        miny = min(y) if min(y) < miny else miny
+        maxy = max(y) if max(y) > maxy else maxy
+
+        ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2)])
+        ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2))
+
+        for label in ax.get_xticklabels():
+            label.set_fontsize('xx-small')
+
+        for label in ax.get_yticklabels():
+            label.set_fontsize('xx-small')
+
+        #ax.set_ybound(0., 1.)
+        ax.set_xlabel('trees', size='small')
+        ax.set_ylabel(scoring_method.upper(), size='small')
+        subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
+        ax.set_title('%s' % subtitle, weight='bold', size='small')
+
+    counter = 1
+    for DATASET in sorted(scores.keys()):
+        if DATASET == "marketing":
+            continue
+
+        add_scores_plot(counter, DATASET)
+        counter += 1
+
+    ax = fig.add_subplot(4, 3, counter)
+
+    ax.plot([0], [0], 'k-', [0], [0], 'g-')
+    ax.set_axis_off()
+
+    plt.legend(["Random Forest", "Model-map-based forest"], frameon=False)
+    leg = plt.gca().get_legend()
+    ltext = leg.get_texts()
+    plt.setp(ltext, fontsize='xx-small')
+
+    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s.pdf" % (EXPMARKER, scoring_method)))
+
def _aggregate_by_trees(records):
    """Aggregate (tree_count, score) records by tree count.

    Sorts ``records`` in place (kept from the original code -- the caller's
    list ends up sorted) and returns parallel sequences
    ``(x, y, std, med)``: the distinct tree counts, and the mean, standard
    deviation and median of the scores at each count.
    """
    records.sort()
    x, y, std, med = [], [], [], []
    for _, group in groupby(records, key=itemgetter(0)):
        counts, vals = zip(*group)
        x.append(counts[0])
        # Consistently use np.mean here; the original mixed np.mean and
        # sum(s)/len(s) between the two curves.
        y.append(np.mean(vals))
        std.append(np.std(vals))
        med.append(np.median(vals))
    return x, np.array(y), np.array(std), med

def plot_one(DATASET, scoring_method, plot_model_map=True):
    """Plot the mean score vs. forest size for a single dataset.

    Draws the plain random-forest curve and, when ``plot_model_map`` is
    true, the model-map-based forest curve, then saves the figure as
    ``res_<EXPMARKER>_<DATASET>_<scoring_method>.pdf`` under ROOT/_ensemble_.
    Reads the module-level ``scores`` dictionary.
    """
    print("drawing plots...")
    fig = plt.figure(figsize=(3, 2), dpi=300)
    fig.subplots_adjust(wspace=0.4, hspace=0.6, top=0.9, bottom=0.15, left=0.15, right=0.96)

    ax = fig.add_subplot(111)

    x, y, std, med = _aggregate_by_trees(scores[DATASET][scoring_method])
    ax.plot(x, y, '-', color='k', linewidth=0.5, label='Random Forest')

    miny, maxy = min(y), max(y)

    if plot_model_map:
        x, y, std, med = _aggregate_by_trees(scores[DATASET][scoring_method + "_mm"])
        ax.plot(x, y, '-', color='g', linewidth=0.5, label='Model-map-based forest')
        miny = min(miny, min(y))
        maxy = max(maxy, max(y))

    # NOTE(review): the -0.15 offset on the upper bound is kept from the
    # original; presumably tuned for one specific figure -- confirm.
    ax.set_yticks([round(miny - 0.005, 2), round(maxy + 0.005, 2) - 0.15])
    ax.set_ybound(round(miny - 0.005, 2), round(maxy + 0.005, 2) - 0.15)

    for label in ax.get_xticklabels():
        label.set_fontsize('xx-small')

    for label in ax.get_yticklabels():
        label.set_fontsize('xx-small')

    if plot_model_map:
        leg = plt.legend(frameon=False)
        leg = plt.gca().get_legend()
        plt.setp(leg.get_texts(), fontsize='x-small')

    ax.set_xlabel('trees', size='small')
    ax.set_ylabel(scoring_method.upper(), size='small')
    # Dataset name -> title case, dropping the "sample" suffix token.
    subtitle = " ".join([s[0].upper() + s[1:].lower() for s in re.split("_|-", DATASET) if s != "sample"])
    ax.set_title('%s' % subtitle, weight='bold', size='small')

    fig.savefig(os.path.join(ROOT, "_ensemble_", "res_%s_%s_%s.pdf" % (EXPMARKER, DATASET, scoring_method)))
+
# Experiment configuration: EXPMARKER identifies this run's pickled score
# files on disk; DO lists the datasets whose scores should be loaded.
EXPMARKER = "120_120_3fold_tree_base"
DO = ["breast-cancer-wisconsin", "voting", "zoo", "mushroom", "adult_sample", "glass", "primary-tumor", "vehicle", "wdbc", "dermatology", "iris", "marketing"]

# Merge every per-dataset score dictionary into the module-level `scores`.
for DATASET in DO:
    fname = os.path.join(ROOT, "_ensemble_", "scores_%s_%s.pkl" % (EXPMARKER, DATASET))
    print(fname)
    if os.path.exists(fname) and os.path.isfile(fname):
        # was: pickle.load(open(fname, "rb")) -- the file handle was never
        # closed; the context manager releases it deterministically.
        with open(fname, "rb") as f:
            scores.update(pickle.load(f))

#plot_scores("ca")
#plot_scores("auc")
plot_scores("brier")

plot_one("marketing", "brier", plot_model_map=True)
#plot_one("zoo", "brier", plot_model_map=False)
+
+
def best_scores(scores):
    """Return all raw scores recorded at the best-performing tree count.

    ``scores`` is a list of ``(tree_count, score)`` pairs.  Pairs are grouped
    by tree count, the count with the lowest mean score is selected (the
    scores used here are losses such as the Brier score, so lower is better,
    as the ``min`` below shows), and every raw score measured at that count
    is returned, in sorted order.

    Unlike the original in-place ``list.sort``, the caller's list is left
    unmodified.
    """
    ordered = sorted(scores)
    means = []
    for count, group in groupby(ordered, key=itemgetter(0)):
        _, vals = zip(*group)
        means.append((count, np.mean(vals)))

    best = min(means, key=itemgetter(1))[0]
    print("best %s" % (best,))
    return [s for t, s in ordered if t == best]
+
def average_scores(scores):
    """Return the mean score per tree count as a numpy array.

    ``scores`` is a list of ``(tree_count, score)`` pairs; the result holds
    one mean per distinct tree count, ordered by tree count.  Unlike the
    original in-place ``list.sort``, the caller's list is left unmodified.
    """
    means = []
    for _, group in groupby(sorted(scores), key=itemgetter(0)):
        _, vals = zip(*group)
        means.append(np.mean(vals))

    return np.array(means)
+
# --- Statistical comparison: RF vs. model-map-based forest ---------------
# First, compare the mean Brier-score curves on the "marketing" dataset;
# both curves are trimmed to a common length before the paired tests.
x = average_scores(scores["marketing"]["brier"])
y = average_scores(scores["marketing"]["brier_mm"])

trim = min(len(x), len(y))
# Sum of pointwise differences: positive means RF has the larger loss.
print np.sum(x[:trim] - y[:trim])

# Average rank of each method across tree counts (rank 1 = lower score).
ranks = scipy.stats.mstats.rankdata(np.array([x[:trim], y[:trim]]), axis=0)
print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
# Wilcoxon signed-rank test on the paired mean-score curves.
print scipy.stats.wilcoxon(x[:trim], y[:trim])

x = best_scores(scores["marketing"]["brier"])
y = best_scores(scores["marketing"]["brier_mm"])

print "Marketing"
# NOTE(review): Brier score is a loss (lower is better), yet the "best"
# value reported here is max(x)/max(y) -- confirm this is intended.
print "best RF:", max(x)
print "best MM:", max(y)
print "Student's t-test:", scipy.stats.ttest_ind(x, y)

# Repeat the rank + Wilcoxon comparison pooled over all loaded datasets,
# for each scoring method.
for scoring_method in ["ca", "auc", "brier"]:
    print scoring_method.upper()
    x, y = [], []
    for DATASET in scores:

        trim = min(len(list(average_scores(scores[DATASET][scoring_method]))), len(list(average_scores(scores[DATASET][scoring_method+ "_mm"]))))
        x.extend(list(average_scores(scores[DATASET][scoring_method]))[:trim])
        y.extend(list(average_scores(scores[DATASET][scoring_method + "_mm"]))[:trim])

    print "Ranks"
    ranks = scipy.stats.mstats.rankdata(np.array([x, y]), axis=0)
    print "\n".join("%s: %.3f" % (name, r) for r, name in zip(np.mean(ranks, axis=1), ["RF", "MM forest"]))
    print scipy.stats.wilcoxon(x, y)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

examples/ensemble/rf_performance_plot_3.py

import matplotlib
matplotlib.use('Agg')  # headless backend: figures go straight to files

import os.path
import cPickle as pickle
import matplotlib.pyplot as plt

from Orange import data, ensemble, evaluation, utils
from Orange.classification.tree import SimpleTreeLearner

# Root of the model-map results tree; uncomment the line for this machine.
ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+
class SimpleTreeLearnerSetProb(object):
    """
    Wrapper around Orange.classification.tree.SimpleTreeLearner that sets
    skip_prob before each call so that, on average, a square root of the
    attributes is randomly chosen for each split (the usual random-forest
    heuristic).  Inherits from ``object`` so it is a new-style class on
    Python 2.
    """
    def __init__(self, wrapped):
        # wrapped: the SimpleTreeLearner instance to delegate to.
        self.wrapped = wrapped

    def __call__(self, examples, weight=0):
        # P(skip attribute) = 1 - sqrt(n)/n for n attributes.
        n = len(examples.domain.attributes)
        self.wrapped.skip_prob = 1 - n ** 0.5 / n
        # NOTE(review): `weight` is accepted for learner-interface
        # compatibility but is not forwarded -- confirm this is intended.
        return self.wrapped(examples)
+
def pickle_scores(DATASET, a=0, b=50, step=1, depth=2, min_instances=5):
    """Train random forests of a, a+step, ..., b-1 trees on DATASET and
    pickle the resulting score curves.

    The dataset is split 50/50 (stratified, fixed seed 42) into train and
    test sets; for each forest size the CA, AUC and Brier score on the test
    set are recorded.  The ``(cas, aucs, briers)`` lists are pickled under
    ``ROOT/_ensemble_``.

    NOTE(review): `depth` and `min_instances` only affect the output file
    name -- the per-tree learner that used them is commented out below.

    Raises IOError when the dataset .tab file is found neither in the
    Orange install nor under ROOT/tab.
    """
    print("Building trees: %s" % DATASET)
    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))

    # Fall back to the local tab/ folder when the dataset is not installed.
    if not (os.path.exists(fname) and os.path.isfile(fname)):
        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))

        if not (os.path.exists(fname) and os.path.isfile(fname)):
            raise IOError("File %s not found." % fname)

    dataset = data.Table(fname)

    indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(dataset)
    train = dataset.select(indices, 0)
    test = dataset.select(indices, 1)

    cas, aucs, briers = [], [], []

    for trees in range(a, b, step):
        # uses gain ratio
        #tree = SimpleTreeLearnerSetProb(SimpleTreeLearner(max_depth=depth, min_instances=min_instances))
        #rf_learner = ensemble.forest.RandomForestLearner(learner=tree, trees=trees, name="RF: %d trees; max depth: %d; min instances: %d" % (trees, depth, min_instances))
        rf_learner = ensemble.forest.RandomForestLearner(trees=trees, name="RF: %d trees; max depth: None; min instances: %d" % (trees, min_instances))

        results = evaluation.testing.learn_and_test_on_test_data([rf_learner], train, test)

        cas.append(evaluation.scoring.CA(results)[0])
        aucs.append(evaluation.scoring.AUC(results)[0])
        briers.append(evaluation.scoring.Brier_score(results)[0])

    out = os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s_instances_%d.pkl" % (DATASET, a, b, depth, min_instances))
    # was: pickle.dump(..., open(out, "wb"), -1) -- the handle was never
    # closed; the context manager guarantees the file is flushed and closed.
    with open(out, "wb") as f:
        pickle.dump((cas, aucs, briers), f, -1)
+
+pickle_scores("zoo", a=2, b=300, depth=None)
+pickle_scores("marketing", a=2, b=300, depth=None)
+pickle_scores("vehicle", a=2, b=300, depth=None)
+pickle_scores("iris", a=2, b=300, depth=None)
+pickle_scores("voting", a=2, b=300, depth=None)
+
def plot_auc(DATASET, a=2, b=500, depth=2, min_instances=5):
    """Plot the pickled score curves for DATASET and save them as a PNG.

    Despite the name, all three recorded scores (AUC, CA, Brier) are
    plotted, one subplot each, from the pickle written by
    ``pickle_scores`` with the same parameters.  The name is kept for
    backward compatibility with existing callers.
    """
    print("Drawing plots: %s" % DATASET)
    pkl = os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s_instances_%d.pkl" % (DATASET, a, b, depth, min_instances))
    # was: pickle.load(open(pkl, "rb")) -- the file handle was never closed.
    with open(pkl, "rb") as f:
        cas, aucs, briers = pickle.load(f)

    fig = plt.figure(figsize=(3, 6), dpi=300)
    fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)

    # One subplot per score, stacked vertically in the original order.
    for row, (values, label) in enumerate([(aucs, 'AUC'), (cas, 'CA'), (briers, 'Brier')], start=1):
        ax = fig.add_subplot(3, 1, row)
        ax.plot(range(len(values)), values, '-')
        ax.set_xlabel('trees')
        ax.set_ylabel(label)
        ax.set_title('RF: %s' % DATASET)

    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s.png" % (DATASET, a, b, depth)))
+
+plot_auc("zoo", a=2, b=300, depth=None)
+plot_auc("marketing", a=2, b=300, depth=None)
+plot_auc("vehicle", a=2, b=300, depth=None)
+plot_auc("iris", a=2, b=300, depth=None)
+plot_auc("voting", a=2, b=300, depth=None)

examples/ensemble/trees_score.py

-import matplotlib
-matplotlib.use('Agg')
-
-import os.path
-import cPickle as pickle
-import matplotlib.pyplot as plt
-
-from Orange import data, ensemble, evaluation, utils
-from Orange.classification.tree import SimpleTreeLearner
-
-ROOT = "/home/miha/work/res/modelmaps"
-#ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
-#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
-
-class SimpleTreeLearnerSetProb():
-    """
-    Orange.classification.tree.SimpleTreeLearner which sets the skip_prob
-    so that on average a square root of the attributes will be
-    randomly choosen for each split.
-    """
-    def __init__(self, wrapped):
-        self.wrapped = wrapped
-
-    def __call__(self, examples, weight=0):
-        self.wrapped.skip_prob = 1-len(examples.domain.attributes)**0.5/len(examples.domain.attributes)
-        return self.wrapped(examples)
-
-def pickle_scores(DATASET, a=0, b=50, step=1, depth=2, min_instances=5):
-    print "Building trees:", DATASET
-    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
-
-    if not (os.path.exists(fname) and os.path.isfile(fname)):
-        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
-
-        if not (os.path.exists(fname) and os.path.isfile(fname)):
-            raise IOError("File %s not found." % fname)
-
-    dataset = data.Table(fname)
-
-    indices = data.sample.SubsetIndices2(p0=0.5, stratified=data.sample.SubsetIndices.Stratified, randseed=42)(dataset)
-    train = dataset.select(indices, 0)
-    test = dataset.select(indices, 1)
-
-    cas = []
-    aucs = []
-    briers = []
-
-    for trees in range(a, b, step):
-        # uses gain ratio
-        #tree = SimpleTreeLearnerSetProb(SimpleTreeLearner(max_depth=depth, min_instances=min_instances))
-        #rf_learner = ensemble.forest.RandomForestLearner(learner=tree, trees=trees, name="RF: %d trees; max depth: %d; min instances: %d" % (trees, depth, min_instances))
-        rf_learner = ensemble.forest.RandomForestLearner(trees=trees, name="RF: %d trees; max depth: None; min instances: %d" % (trees, min_instances))
-        #rf_classifier = rf_learner(train)
-
-        learners = [rf_learner]
-
-        results = evaluation.testing.learn_and_test_on_test_data(learners, train ,test)
-
-        cas.append(evaluation.scoring.CA(results)[0])
-        aucs.append(evaluation.scoring.AUC(results)[0])
-        briers.append(evaluation.scoring.Brier_score(results)[0])
-
-        #print "%d, %-8s %5.6f  %5.6f  %5.6f" % (trees, learners[0].name, cas[-1], aucs[-1], briers[-1])
-
-    pickle.dump((cas, aucs, briers), open(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s_instances_%d.pkl" % (DATASET, a, b, depth, min_instances)), "wb"), -1)
-
-pickle_scores("zoo", a=2, b=300, depth=None)
-pickle_scores("marketing", a=2, b=300, depth=None)
-pickle_scores("vehicle", a=2, b=300, depth=None)
-pickle_scores("iris", a=2, b=300, depth=None)
-pickle_scores("voting", a=2, b=300, depth=None)
-
-def plot_auc(DATASET, a=2, b=500, depth=2, min_instances=5):
-    print "Drawing plots:", DATASET
-    cas, aucs, briers = pickle.load(open(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s_instances_%d.pkl" % (DATASET, a, b, depth, min_instances)), "rb"))
-
-    fig = plt.figure(figsize=(3, 6), dpi=300)
-    fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)
-
-    ax = fig.add_subplot(3, 1, 1)
-
-    ax.plot(range(len(aucs)), aucs, '-')
-
-    ax.set_xlabel('trees')
-    ax.set_ylabel('AUC')
-    ax.set_title('RF: %s' % DATASET)
-
-    ax = fig.add_subplot(3, 1, 2)
-
-    ax.plot(range(len(cas)), cas, '-')
-
-    ax.set_xlabel('trees')
-    ax.set_ylabel('CA')
-    ax.set_title('RF: %s' % DATASET)
-
-    ax = fig.add_subplot(3, 1, 3)
-
-    ax.plot(range(len(briers)), briers, '-')
-
-    ax.set_xlabel('trees')
-    ax.set_ylabel('Brier')
-    ax.set_title('RF: %s' % DATASET)
-
-    fig.savefig(os.path.join(ROOT, "_ensemble_", "scores_%s_%d_to_%d_depth_%s.png" % (DATASET, a, b, depth)))
-
-plot_auc("zoo", a=2, b=300, depth=None)
-plot_auc("marketing", a=2, b=300, depth=None)
-plot_auc("vehicle", a=2, b=300, depth=None)
-plot_auc("iris", a=2, b=300, depth=None)
-plot_auc("voting", a=2, b=300, depth=None)