Commits

Miha Stajdohar committed e49e39e

VizRank vs Model Map analysis.

  • Participants
  • Parent commits baeaabc

Comments (0)

Files changed (4)

File examples/projections/distance_histograms.py

+import cPickle as pickle
+import pylab, os.path
+
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps\\_projections_"
+
+vals = pickle.load(open(os.path.join(ROOT, "hist_vals.pkl"), "rb"))
+
+n, bins, patches = pylab.hist(vals["zoo class"], 200, normed=1, histtype='stepfilled')
+
+pylab.figure()
+
+n, bins, patches = pylab.hist(vals["zoo mann"], 200, normed=1, histtype='stepfilled')
+
+pylab.figure()
+
+n, bins, patches = pylab.hist(vals["breast class"], 200, normed=1, histtype='stepfilled')
+
+pylab.figure()
+
+n, bins, patches = pylab.hist(vals["breast mann"], 200, normed=1, histtype='stepfilled')
+
+pylab.show()

File examples/projections/distance_metric_comparisson.py

+__author__ = 'Miha Stajdohar'
+
+import cPickle as pickle
+import os.path, sys
+import scipy
+import numpy as np
+import _modelmaps as mm
+
+from time import time
+from Orange.orng import orngVizRank as vr
+from Orange import utils
+
+# Result directory on the author's machines.  Each assignment overrides the
+# previous one, so only the last uncommented ROOT takes effect.
+ROOT = "/home/miha/work/res/modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+
+def build_map_for_metric_comparisson(DATASET, N):
+    print "DATA SET: %s" % DATASET
+
+    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
+
+    nfeatures = len(build_map.data_d.domain.features)
+    features = mm.get_feature_subsets(build_map.data_d.domain, N)
+
+    max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) / 2
+    features_scatterplot = mm.get_feature_subsets_scatterplot(build_map.data_d.domain, max_nfeatures_scatterplot)
+
+    models = []
+    models.extend([build_map.build_projection_model(f, vr.LINEAR_PROJECTION) for f in features])
+    models.extend([build_map.build_projection_model(f, vr.RADVIZ) for f in features])
+    models.extend([build_map.build_projection_model(f, vr.POLYVIZ) for f in features])
+    models.extend([build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot])
+
+    table = build_map.build_model_data(models)
+
+    smxs = {}
+    s = time()
+    smxs["5.1"] = build_map.build_model_matrix(models, mm.distance_class)
+    print (time() - s) / 6
+#    mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)
+
+    s = time()
+    smxs["5.3"] = build_map.build_model_matrix(models, mm.distance_euclidean)
+    print (time() - s) / 60
+#    mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)
+
+    s = time()
+    smxs["5.4"] = build_map.build_model_matrix(models, mm.distance_manhattan)
+    print (time() - s) / 60
+#    mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)
+
+    s = time()
+    smxs["5.5"] = build_map.build_model_matrix(models, mm.distance_rank)
+    print (time() - s) / 60
+#    mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)
+
+    s = time()
+    smxs["5.2"] = build_map.build_model_matrix(models, mm.distance_mi)
+    print (time() - s) / 60
+#    mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)
+
+#    r_file = os.path.join(ROOT, "_projections_", "vals_%s.pkl" % sys.platform)
+#    if os.path.exists(r_file) and os.path.isfile(r_file):
+#        res = pickle.load(open(r_file, "rb"))
+#    else:
+#        res = {}
+#
+#    res.update({"%s %s" % (DATASET, key): smxs[key].get_values() for key in smxs})
+#    pickle.dump(res, open(r_file, "wb"), -1)
+
+    return smxs
+
+def matrix_correlation(smxs):
+    keys = sorted(smxs.keys())
+    res = {k1: {k2: {} for k2 in keys} for k1 in keys}
+    for i in range(len(smxs)):
+        for j in range(i + 1):
+            r = {}
+            smx1 = smxs[keys[i]]
+            smx2 = smxs[keys[j]]
+
+            r["rank"], r["rank p"] = np.average([scipy.stats.spearmanr(smx1[n], smx2[n]) for n in range(smx1.dim)], axis=0)
+
+            res[keys[i]][keys[j]] = r
+            res[keys[j]][keys[i]] = r
+
+    #pickle.dump(res, open(os.path.join(ROOT, "_projections_", "compare_distances.pkl"), "wb"), -1)
+
+    print "rank"
+    for i in range(len(smxs)):
+        print keys[i], "  ",
+        print "  ".join(["%s: %lf" % (keys[j], res[keys[i]][keys[j]]["rank"]) for j in range(i)])
+    print
+    print "rank p"
+    for i in range(len(smxs)):
+        print keys[i], "  ",
+        print "  ".join(["%s: %e" % (keys[j], res[keys[i]][keys[j]]["rank p"]) for j in range(i)])
+
+# Module-level driver: builds the model matrices and prints the correlation
+# tables for every data set whose lines are not commented out.
+smxs = build_map_for_metric_comparisson("zoo", N=1000)
+#smxs, table, data = mm.load(os.path.join(ROOT, "_projections_", "proj_alldist_4_zoo_1000"))
+matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("wine", N=1000)
+#matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("voting", N=1000)
+#matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("vehicle", N=1000)
+#matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("iris", N=10)
+#matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("heart_disease", N=1000)
+#matrix_correlation(smxs)
+
+smxs = build_map_for_metric_comparisson("breast-cancer-wisconsin-cont", N=501)
+matrix_correlation(smxs)

File examples/projections/radviz.py

+import os.path
+import numpy as np
+import matplotlib.pyplot as plt
+import _modelmaps as mm
+
+from itertools import groupby
+from operator import itemgetter
+from Orange import clustering, data, distance, utils
+from Orange.orng import orngVizRank as vr
+
+# Output directory on the author's machines.  Each assignment overrides the
+# previous one, so only the last uncommented ROOT takes effect.
+ROOT = "/home/miha/work/res/modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+
+def radviz_in_vr_mm(DATASET, centroids):
+    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
+    nfeatures = len(build_map.data_d.domain.features)
+
+    features = mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3)
+
+
+    models = []
+    models.extend(build_map.build_projection_model(attrs, vr.RADVIZ) for attrs in features)
+    table = build_map.build_model_data(models)
+
+    # VIZRANK
+
+    def save_figure(model_instances, method):
+        fig = plt.figure(figsize=(6, 9), dpi=300)
+        fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)
+
+        for i, (score, attr) in enumerate(scored_attributes):
+            add_subplot(fig, score, attr, i=(i + 1))
+
+        plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
+        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%s_%s.pdf" % (DATASET, method.lower().replace(" ", ""))))
+
+    def add_subplot(fig, score, attrs, i=1):
+        graph = data.preprocess.scaling.ScaleScatterPlotData()
+        graph.setData(build_map.data(), graph.rawSubsetData)
+        attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
+        selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
+        class_list = graph.original_data[graph.data_class_index]
+
+        ax = fig.add_subplot(3, 2, i)
+
+        x_dom = set(selected_data[0])
+        if len(x_dom) < 10:
+            ax.set_xticklabels(list(set(selected_data[0])), size='x-small')
+        else:
+            for label in ax.get_xticklabels():
+                label.set_fontsize('x-small')
+
+        y_dom = set(selected_data[1])
+        if len(y_dom) < 10:
+            ax.set_yticklabels(list(set(selected_data[1])), size='x-small')
+        else:
+            for label in ax.get_yticklabels():
+                label.set_fontsize('x-small')
+
+        ax.scatter(selected_data[0], selected_data[1], c=class_list, s=50., alpha=0.75)
+
+        ax.set_xlabel(attrs[0], size='small')
+        ax.set_ylabel(attrs[1], size='small')
+
+        ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
+                        horizontalalignment='center', verticalalignment='center')
+
+    scored = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)
+    save_figure(scored[:6], "VizRank")
+
+
+    # MODEL MAP
+
+#    class ModelDistanceConstructor(distance.DistanceConstructor):
+#
+#        def __new__(cls, data=None):
+#            self = distance.DistanceConstructor.__new__(cls)
+#            return self.__call__(data) if data else self
+#
+#        def __call__(self, table):
+#            return ModelDistance()
+#
+#    class ModelDistance(distance.Distance):
+#        def __call__(self, e1, e2):
+#            return mm.distance_manhattan(e1["model"].value, e2["model"].value)
+#
+#    def data_center(table):
+#        onemodel = table[0]["model"].value
+#        model = mm.Model("SCATTERPLOT", None,
+#            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
+#            onemodel.class_values, [],
+#            [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
+#            [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
+#
+#        return model.get_instance(table.domain)
+#
+#    clustering.kmeans.data_center = data_center
+#    kmeans = clustering.kmeans.Clustering(table, centroids=centroids, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+#
+#    clusters = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+#
+#    best_projs = []
+#    for k, g in groupby(clusters, key=itemgetter(0)):
+#        best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+#
+#    best_projs.sort(key=itemgetter(0), reverse=True)
+#    scored = [(score, table[key]["attributes"].value.split(", ")) for score, key in best_projs]
+#
+#    save_figure(scored[:6], "Model Map")
+
+
+    # SAVE MODEL MAP
+
+    smx = build_map.build_model_matrix(models)
+    mm.save(os.path.join(ROOT, "_projections_", "radviz_%s" % DATASET), smx, table, build_map.data())
+
+# Module-level driver: runs the RadViz analysis for the uncommented data set.
+radviz_in_vr_mm("zoo", 10)
+#radviz_in_vr_mm("vehicle", 6)

File examples/projections/scatterplot.py

+import os.path
+import numpy as np
+import matplotlib.pyplot as plt
+import _modelmaps as mm
+
+from itertools import groupby
+from operator import itemgetter
+from Orange import clustering, data, distance, utils
+from Orange.orng import orngVizRank as vr
+
+# Output directory on the author's machines.  Each assignment overrides the
+# previous one, so only the last uncommented ROOT takes effect.
+ROOT = "/home/miha/work/res/modelmaps"
+#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
+ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
+
+def scatterplots_in_vr_mm(DATASET, centroids):
+    """Save the six best scatterplots chosen by VizRank score and by model map.
+
+    DATASET is the base name of a ``.tab`` file in Orange's dataset install
+    directory; centroids is passed to Orange's k-means ``Clustering``.
+
+    Two PDF figures ("VizRank" and "Model Map") and the model map itself are
+    written under ROOT/_projections_.
+    """
+    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
+    nfeatures = len(build_map.data_d.domain.features)
+
+    # Number of distinct feature pairs: n * (n - 1) / 2.
+    max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) / 2
+    features_scatterplot = mm.get_feature_subsets_scatterplot(build_map.data().domain, max_nfeatures_scatterplot)
+
+
+    models = []
+    models.extend(build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot)
+    table = build_map.build_model_data(models)
+
+    # VIZRANK
+
+    def save_figure(scored_attributes, method):
+        """Draw every (score, attributes) pair into a 3x2 grid and save it as PDF."""
+        fig = plt.figure(figsize=(6, 9), dpi=300)
+        fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)
+
+        for i, (score, attr) in enumerate(scored_attributes):
+            add_subplot(fig, score, attr, i=(i + 1))
+
+        plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
+        plt.savefig(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s.pdf" % (DATASET, method.lower().replace(" ", ""))))
+
+    def add_subplot(fig, score, attrs, i=1):
+        """Scatter attrs[0] against attrs[1] into cell i of the 3x2 grid."""
+        graph = data.preprocess.scaling.ScaleScatterPlotData()
+        graph.setData(build_map.data(), graph.rawSubsetData)
+        attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
+        selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
+        class_list = graph.original_data[graph.data_class_index]
+
+        ax = fig.add_subplot(3, 2, i)
+
+        # With fewer than ten distinct values the values themselves become
+        # the tick labels; otherwise only the label font is shrunk.
+        x_dom = set(selected_data[0])
+        if len(x_dom) < 10:
+            ax.set_xticklabels(list(set(selected_data[0])), size='x-small')
+        else:
+            for label in ax.get_xticklabels():
+                label.set_fontsize('x-small')
+
+        y_dom = set(selected_data[1])
+        if len(y_dom) < 10:
+            ax.set_yticklabels(list(set(selected_data[1])), size='x-small')
+        else:
+            for label in ax.get_yticklabels():
+                label.set_fontsize('x-small')
+
+        ax.scatter(selected_data[0], selected_data[1], c=class_list, s=50., alpha=0.75)
+
+        ax.set_xlabel(attrs[0], size='small')
+        ax.set_ylabel(attrs[1], size='small')
+
+        ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
+                        horizontalalignment='center', verticalalignment='center')
+
+    # (P value, attributes) pairs in descending order; the best six are drawn.
+    scored = sorted(((ex["P"].value, ex["model"].value.attributes) for ex in table), reverse=True)
+    save_figure(scored[:6], "VizRank")
+
+
+    # MODEL MAP
+
+    class ModelDistanceConstructor(distance.DistanceConstructor):
+        """Distance constructor that always yields a ModelDistance."""
+
+        def __new__(cls, data=None):
+            # Called with data: return the distance object right away;
+            # called without: return the constructor itself.
+            self = distance.DistanceConstructor.__new__(cls)
+            return self.__call__(data) if data else self
+
+        def __call__(self, table):
+            return ModelDistance()
+
+    class ModelDistance(distance.Distance):
+        """Distance between two rows: mm.distance_manhattan of their models."""
+        def __call__(self, e1, e2):
+            return mm.distance_manhattan(e1["model"].value, e2["model"].value)
+
+    def data_center(table):
+        """Return an instance whose model holds the mean probabilities of table."""
+        onemodel = table[0]["model"].value
+        # The remaining Model arguments are filled from the first row's model:
+        # its class_values, and its first class key repeated to the lengths of
+        # instance_predictions and instance_classes.
+        model = mm.Model("SCATTERPLOT", None,
+            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
+            onemodel.class_values, [],
+            [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
+            [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
+
+        return model.get_instance(table.domain)
+
+    # Replaces the module-level data_center in Orange's kmeans before the
+    # clustering runs; the override is not restored afterwards.
+    clustering.kmeans.data_center = data_center
+    kmeans = clustering.kmeans.Clustering(table, centroids=centroids, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+
+    # Pair every entry of kmeans.clusters (presumably one cluster id per table
+    # row - TODO confirm) with its index, sorted by id so that groupby sees
+    # each cluster as one contiguous run.
+    clusters = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+    # For each cluster keep the (P value, row index) with the highest P.
+    best_projs = []
+    for k, g in groupby(clusters, key=itemgetter(0)):
+        best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+
+    best_projs.sort(key=itemgetter(0), reverse=True)
+    scored = [(score, table[key]["attributes"].value.split(", ")) for score, key in best_projs]
+
+    save_figure(scored[:6], "Model Map")
+
+
+    # SAVE MODEL MAP
+
+    smx = build_map.build_model_matrix(models)
+    mm.save(os.path.join(ROOT, "_projections_", "scatterplots_%s" % DATASET), smx, table, build_map.data())
+
+# Module-level driver: runs the scatterplot analysis for the uncommented data set.
+#scatterplots_in_vr_mm("zoo", 10)
+scatterplots_in_vr_mm("vehicle", 6)