Miha Stajdohar committed 67ef2ae

Mostly bug fixes. Also, a performance improvement on datasets with a large number of instances.
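The main performance change is in BuildModelMap.__init__: on datasets with 2000 or more instances, cross-validation drops from the requested number of folds to 2, and the evaluation loop breaks after the first fold. A minimal sketch of the idea, assuming the 2000-instance threshold from the diff (the function name effective_folds is illustrative, not part of the code):

    # Sketch: cap cross-validation folds on large datasets.
    # The 2000-instance cutoff mirrors the value used in this commit.
    def effective_folds(n_instances, folds=10, large=2000):
        # k-fold CV trains k classifiers; on big tables that dominates
        # the build time, so fall back to 2 folds past the threshold.
        return folds if n_instances < large else 2

    print effective_folds(150)   # 10
    print effective_folds(5000)  # 2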


Files changed (4)

_modelmaps/modelmap.py

 
 """
 
-import bz2, itertools, math, random, os.path, time, uuid
+import bz2, itertools, math, random, os.path, time, uuid, sys
 import cPickle as pickle
 
 import scipy.stats
     nattrs = len(attrs)
     total = sum(binomial(nattrs, i) for i in range(min_features, max_features + 1))
 
-    nsubsets = total if nsubsets is None else nsubsets
-
-    if nsubsets > total:
+    if nsubsets is not None and nsubsets > total:
     raise AttributeError("Attribute nsubsets is higher than the number of possible combinations: %d." % total)
 
-    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(min_features, max_features + 1))))
-    selectors = [1] * nsubsets + [0] * (total - nsubsets)
-    random.shuffle(selectors)
-    return list(itertools.compress(combinations, selectors))
+    if min_features == max_features:
+        combinations = itertools.combinations(attrs, max_features)
+    else:
+        combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(min_features, max_features + 1))))
+
+    if nsubsets is None:
+        return list(combinations)
+    else:
+        selectors = [1] * nsubsets + [0] * (total - nsubsets)
+        random.shuffle(selectors)
+        return list(itertools.compress(combinations, selectors))
+    #return list(itertools.compress(combinations, xrange(10)))
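The rewritten get_feature_subsets samples nsubsets combinations by shuffling a 0/1 selector mask and filtering with itertools.compress. A small self-contained example of that trick, with a toy attrs list standing in for the domain features:

    import itertools, random

    attrs = ["a", "b", "c", "d", "e"]
    combinations = itertools.combinations(attrs, 2)  # 10 pairs, lazily

    total, nsubsets = 10, 4
    selectors = [1] * nsubsets + [0] * (total - nsubsets)
    random.shuffle(selectors)

    # compress() walks the iterator once and keeps exactly the items
    # whose selector is 1, so all that gets shuffled is the flat mask.
    print list(itertools.compress(combinations, selectors))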
 
 def get_models_table():
     """Return an empty data table for model meta data."""
 class BuildModelMap(object):
 
     def __init__(self, fname, folds=10, model_limit=500):
-        self.folds = folds
+
         self.model_limit = model_limit
         self.data_d = self._get_data(fname)
         #self.data_c = self._get_data(fname, continuize=True)
         self.data_d = data.filter.IsDefined(domain=self.data_d.domain)(self.data_d)
 
+        self.folds = folds if len(self.data_d) < 2000 else 2
+
         self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
 
     def _get_data(self, fname, continuize=False):
 
         for fold in range(self.folds):
             learnset = table.selectref(self.indices, fold, negate=1)
-            testset = table.selectref(self.indices, fold, negate=0)
+            testset = table.select(self.indices, fold, negate=0)
             classifier = learner(learnset)
 
-            for test_ex in testset:
-                ex = data.Instance(test_ex)
+            for ex in testset:
+                instance_classes.append(ex.get_class().value)
                 ex.setclass("?")
-
                 cl, prob = classifier(ex, classifier.GetBoth)
-                if cl.isSpecial():
-                    raise "Classifier %s returned unknown value" % (classifier.name)
                 probabilities.append(list(prob))
                 instance_predictions.append(cl.value)
-                instance_classes.append(test_ex.get_class().value)
+
+            if len(table) > 2000:
+                break
 
         return Model(method,
-                     learner(table),
+                     None, #learner(table),
                      np.array(probabilities),
                      {val: i for i, val in enumerate(self.data_d.domain.class_var.values)},
                      attributes,
                      XAnchors=XAnchors,
                      YAnchors=YAnchors)
 
+
     def build_rf_models(self, data):
     probabilities = [[] for fold in range(self.folds)]
 

examples/projections/distance_metric_comparisson.py

 __author__ = 'Miha Stajdohar'
 
 import cPickle as pickle
-import os.path, sys
+import os, os.path, sys
 import scipy
 import numpy as np
 import _modelmaps as mm
 def build_map_for_metric_comparisson(DATASET, N):
     print "DATA SET: %s" % DATASET
 
-    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
+    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+
+    if not (os.path.exists(fname) and os.path.isfile(fname)):
+        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
+
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            raise IOError("File %s not found." % fname)
+
+    build_map = mm.BuildModelMap(fname)
 
     nfeatures = len(build_map.data_d.domain.features)
-    features = mm.get_feature_subsets(build_map.data_d.domain, N)
+    features = mm.get_feature_subsets(build_map.data_d.domain, N, min_features=3, max_features=8)
 
     max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) / 2
     features_scatterplot = mm.get_feature_subsets_scatterplot(build_map.data_d.domain, max_nfeatures_scatterplot)
         print keys[i], "  ",
         print "  ".join(["%s: %e" % (keys[j], res[keys[i]][keys[j]]["rank p"]) for j in range(i)])
 
-smxs = build_map_for_metric_comparisson("zoo", N=1000)
+#smxs = build_map_for_metric_comparisson("zoo", N=1000)
 #smxs, table, data = mm.load(os.path.join(ROOT, "_projections_", "proj_alldist_4_zoo_1000"))
-matrix_correlation(smxs)
+#matrix_correlation(smxs)
+
+#smxs = build_map_for_metric_comparisson("breast-cancer-wisconsin", N=501)
+#matrix_correlation(smxs)
 
 #smxs = build_map_for_metric_comparisson("wine", N=1000)
 #matrix_correlation(smxs)
 #smxs = build_map_for_metric_comparisson("heart_disease", N=1000)
 #matrix_correlation(smxs)
 
-smxs = build_map_for_metric_comparisson("breast-cancer-wisconsin-cont", N=501)
+smxs = build_map_for_metric_comparisson("dermatology", N=1000)
 matrix_correlation(smxs)

examples/projections/radviz.py

-import os.path
+import os.path, math, sys, random, itertools
 import numpy as np
+
 import matplotlib.pyplot as plt
+
 import _modelmaps as mm
 
 from itertools import groupby
 from operator import itemgetter
-from Orange import clustering, data, distance, utils
+from Orange import clustering, data, distance, network, utils
 from Orange.orng import orngVizRank as vr
+from matplotlib.patches import Circle
 
 ROOT = "/home/miha/work/res/modelmaps"
 #ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
 ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
 
-def radviz_in_vr_mm(DATASET, centroids):
-    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
-    nfeatures = len(build_map.data_d.domain.features)
+def build_radviz(DATASET, ROOT, n_attributes):
+    outname = os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s.bz2" % (n_attributes, DATASET, sys.platform))
+    if os.path.exists(outname) and os.path.isfile(outname):
+        return
 
-    features = mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3)
+    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+    if not (os.path.exists(fname) and os.path.isfile(fname)):
+        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
 
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            raise IOError("File %s not found." % fname)
 
-    models = []
-    models.extend(build_map.build_projection_model(attrs, vr.RADVIZ) for attrs in features)
+    build_map = mm.BuildModelMap(fname)
+
+    print "get features..."
+    if n_attributes == 3:
+        features = sorted(mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3))
+    elif n_attributes == 4:
+        features = mm.get_feature_subsets(build_map.data().domain, min_features=4, max_features=4)
+
+        if len(features) > 3000:
+            selectors = [1] * 3000 + [0] * (len(features) - 3000)
+            random.shuffle(selectors)
+            features = list(itertools.compress(features, selectors))
+
+        for i in range(len(features)):
+            f = features[i]
+            features.append([f[1], f[0], f[2], f[3]])
+            features.append([f[0], f[2], f[1], f[3]])
+    else:
+        raise AttributeError("Only Radviz on 3 or 4 features supported.")
+
+    features.sort()
+    print "build %d models..." % len(features)
+    models = [build_map.build_projection_model(attrs, vr.RADVIZ) for attrs in features]
+    print "build model data..."
     table = build_map.build_model_data(models)
 
+    smx = build_map.build_model_matrix(models)
+    mm.save(os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s" % (n_attributes, DATASET, sys.platform)), smx, table, build_map.data())
+
+def radviz_in_vr_mm_4(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
+    return radviz_in_vr_mm(smx, table, original_data, DATASET, ROOT, clusters, seed, n_attributes=4, iterative_clustering=iterative_clustering, linkage=linkage)
+
+def radviz_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, n_attributes=3, iterative_clustering=False, linkage=None):
+
     # VIZRANK
 
-    def save_figure(model_instances, method):
+    def save_figure(model_instances, method, clustering_type=""):
         fig = plt.figure(figsize=(6, 9), dpi=300)
-        fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)
+        fig.subplots_adjust(wspace=0.3, hspace=0.2, top=0.9, bottom=0.01, left=0, right=0.90)
 
-        for i, (score, attr) in enumerate(scored_attributes):
-            add_subplot(fig, score, attr, i=(i + 1))
+        for i, inst in enumerate(model_instances[:6]):
+            add_subplot(fig, inst, i=(i + 1))
 
         plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
-        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%s_%s.pdf" % (DATASET, method.lower().replace(" ", ""))))
+        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s_%s.pdf" %
+            (n_attributes, DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))
 
-    def add_subplot(fig, score, attrs, i=1):
-        graph = data.preprocess.scaling.ScaleScatterPlotData()
-        graph.setData(build_map.data(), graph.rawSubsetData)
-        attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
-        selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
+    def add_subplot(fig, model_instance, i=1):
+        graph = data.preprocess.scaling.ScaleLinProjData()
+        graph.normalizeExamples = 1
+        graph.setData(original_data, graph.rawSubsetData)
+
+        attr_indices = [graph.attribute_name_index[attr] for attr in model_instance["attributes"].value.split(", ")]
         class_list = graph.original_data[graph.data_class_index]
 
+        validData = graph.getValidList(attr_indices)
+        transProjData = graph.createProjectionAsNumericArray(attr_indices, validData=validData, normalize=graph.normalizeExamples, jitterSize=-1, useAnchorData=1, removeMissingData=0)
+        projData = transProjData.T
+
         ax = fig.add_subplot(3, 2, i)
 
-        x_dom = set(selected_data[0])
-        if len(x_dom) < 10:
-            ax.set_xticklabels(list(set(selected_data[0])), size='x-small')
-        else:
-            for label in ax.get_xticklabels():
-                label.set_fontsize('x-small')
+        ax.scatter(projData[0], projData[1], c=class_list, s=50., alpha=0.75)
 
-        y_dom = set(selected_data[1])
-        if len(y_dom) < 10:
-            ax.set_yticklabels(list(set(selected_data[1])), size='x-small')
-        else:
-            for label in ax.get_yticklabels():
-                label.set_fontsize('x-small')
+        xanchors = graph.create_xanchors(len(attr_indices))
+        yanchors = graph.create_yanchors(len(attr_indices))
 
-        ax.scatter(selected_data[0], selected_data[1], c=class_list, s=50., alpha=0.75)
+        ax.add_artist(Circle((0., 0.), radius=1, edgecolor="black", facecolor="none", alpha=0.75))
+        ax.scatter(xanchors, yanchors, c="black", s=50., alpha=0.75)
+        for x, y, attr in zip(xanchors, yanchors, model_instance["attributes"].value.split(", ")):
+            ax.text(x*1.06, y*1.18-0.04, attr, size="x-small")
 
-        ax.set_xlabel(attrs[0], size='small')
-        ax.set_ylabel(attrs[1], size='small')
 
-        ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
+        ax.set_axis_off()
+        ax.set_xlim(-1.1, 1.1)
+        ax.set_ylim(-1.1, 1.1)
+
+        ax.set_title(r"$\overline{P}=%.2f$" % (model_instance["P"].value * 100), weight='bold', size='medium', position=(0.5, 1.01),
                         horizontalalignment='center', verticalalignment='center')
 
-    scored = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)
-    save_figure(scored[:6], "VizRank")
+    vr_models = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)
+
+    if DATASET is not None and ROOT is not None:
+        save_figure(vr_models, "VizRank")
 
 
     # MODEL MAP
 
-#    class ModelDistanceConstructor(distance.DistanceConstructor):
-#
-#        def __new__(cls, data=None):
-#            self = distance.DistanceConstructor.__new__(cls)
-#            return self.__call__(data) if data else self
-#
-#        def __call__(self, table):
-#            return ModelDistance()
-#
-#    class ModelDistance(distance.Distance):
-#        def __call__(self, e1, e2):
-#            return mm.distance_manhattan(e1["model"].value, e2["model"].value)
-#
-#    def data_center(table):
-#        onemodel = table[0]["model"].value
-#        model = mm.Model("SCATTERPLOT", None,
-#            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
-#            onemodel.class_values, [],
-#            [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
-#            [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
-#
-#        return model.get_instance(table.domain)
-#
-#    clustering.kmeans.data_center = data_center
-#    kmeans = clustering.kmeans.Clustering(table, centroids=centroids, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
-#
-#    clusters = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
-#
-#    best_projs = []
-#    for k, g in groupby(clusters, key=itemgetter(0)):
-#        best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
-#
-#    best_projs.sort(key=itemgetter(0), reverse=True)
-#    scored = [(score, table[key]["attributes"].value.split(", ")) for score, key in best_projs]
-#
-#    save_figure(scored[:6], "Model Map")
+    class ModelDistanceConstructor(distance.DistanceConstructor):
 
+        def __new__(cls, data=None):
+            self = distance.DistanceConstructor.__new__(cls)
+            return self.__call__(data) if data else self
 
-    # SAVE MODEL MAP
+        def __call__(self, table):
+            return ModelDistance()
 
-    smx = build_map.build_model_matrix(models)
-    mm.save(os.path.join(ROOT, "_projections_", "radviz_%s" % DATASET), smx, table, build_map.data())
+    class ModelDistance(distance.Distance):
+        def __call__(self, e1, e2):
+            return mm.distance_manhattan(e1["model"].value, e2["model"].value)
 
-radviz_in_vr_mm("zoo", 10)
-#radviz_in_vr_mm("vehicle", 6)
+    def data_center(table):
+        onemodel = table[0]["model"].value
+        model = mm.Model("RADVIZ", None,
+            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
+            onemodel.class_values, [],
+            [onemodel.class_values.keys()[0]] * len(onemodel.instance_predictions),
+            [onemodel.class_values.keys()[0]] * len(onemodel.instance_classes))
+
+        return model.get_instance(table.domain)
+
+    table.random_generator = seed
+    clustering.kmeans.data_center = data_center
+
+    if clusters is not None and linkage:
+        clustering_type = "hierarchical %s" % str(linkage).lower()
+        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)
+
+        best_projs = []
+        for c in range(2, clusters + 1):
+            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)
+
+            best_projs_tmp = []
+            for n, cluster in enumerate(topmost):
+                best_projs_tmp.append(max(((table[i]["P"].value, i) for i in cluster), key=itemgetter(0)))
+
+            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
+            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])
+
+            if len(best_projs) >= clusters:
+                break
+
+    elif clusters is not None and iterative_clustering:
+        clustering_type = "kmeans iterative"
+        table.shuffle()
+        best_projs = []
+        for c in range(2, clusters + 1):
+            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+            best_projs_tmp = []
+            for k, g in groupby(clusters_, key=itemgetter(0)):
+                best_projs_tmp.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+
+            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
+            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])
+
+            if len(best_projs) >= clusters:
+                break
+
+        best_projs = best_projs[:clusters]
+    else:
+        if clusters is None:
+            clustering_type = "community detection"
+            graph = network.Graph()
+            graph.add_nodes_from(range(smx.dim))
+            graph.set_items(table)
+            graph.nodes_iter()
+
+            k = int(math.sqrt(smx.dim))
+            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
+            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))
+
+            clusters_ = network.community.label_propagation(graph, iterations=10000, seed=seed)
+            print "Clusters:", len(set(clusters_.values()))
+            clusters_ = sorted(((j, i) for i, j in clusters_.iteritems()), key=itemgetter(0))
+
+        else:
+            clustering_type = "kmeans"
+            table.shuffle()
+            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+        best_projs = []
+        for k, g in groupby(clusters_, key=itemgetter(0)):
+            best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+
+        best_projs.sort(key=itemgetter(0), reverse=True)
+
+    mm_models = [table[key] for score, key in best_projs]
+
+    if DATASET is not None and ROOT is not None:
+        save_figure(mm_models, "Model Map", clustering_type)
+
+    return vr_models, mm_models

examples/projections/scatterplot.py

-import os.path
+import os.path, math, sys
 import numpy as np
+
 import matplotlib.pyplot as plt
+
 import _modelmaps as mm
 
 from itertools import groupby
 from operator import itemgetter
-from Orange import clustering, data, distance, utils
+from Orange import clustering, data, distance, network, utils
 from Orange.orng import orngVizRank as vr
 
 ROOT = "/home/miha/work/res/modelmaps"
 #ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
 ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"
 
-def scatterplots_in_vr_mm(DATASET, centroids):
-    build_map = mm.BuildModelMap(os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab")))
+def build_scatterplots(DATASET, ROOT):
+    outname = os.path.join(ROOT, "_projections_", "scatterplots_%s_%s.bz2" % (DATASET, sys.platform))
+    if os.path.exists(outname) and os.path.isfile(outname):
+        return
+
+    fname = os.path.join(utils.environ.dataset_install_dir, "%s%s" % (DATASET, ".tab"))
+    if not (os.path.exists(fname) and os.path.isfile(fname)):
+        fname = os.path.join(ROOT, "tab", "%s%s" % (DATASET, ".tab"))
+
+        if not (os.path.exists(fname) and os.path.isfile(fname)):
+            raise IOError("File %s not found." % fname)
+
+    build_map = mm.BuildModelMap(fname)
     nfeatures = len(build_map.data_d.domain.features)
 
     max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) / 2
-    features_scatterplot = mm.get_feature_subsets_scatterplot(build_map.data().domain, max_nfeatures_scatterplot)
+    print "get features..."
+    features_scatterplot = sorted(mm.get_feature_subsets_scatterplot(build_map.data().domain, max_nfeatures_scatterplot))
+    print "build %d models..." % len(features_scatterplot)
+    models = [build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot]
+    print "build model data..."
+    table = build_map.build_model_data(models)
 
+    smx = build_map.build_model_matrix(models)
+    mm.save(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s" % (DATASET, sys.platform)), smx, table, build_map.data())
 
-    models = []
-    models.extend(build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot)
-    table = build_map.build_model_data(models)
+def scatterplots_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
 
     # VIZRANK
 
-    def save_figure(scored_attributes, method):
+    def save_figure(models, method, clustering_type=""):
         fig = plt.figure(figsize=(6, 9), dpi=300)
         fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)
 
-        for i, (score, attr) in enumerate(scored_attributes):
-            add_subplot(fig, score, attr, i=(i + 1))
+        for i, model in enumerate(models[:6]):
+            add_subplot(fig, model["P"].value, model["attributes"].value.split(", "), i=(i + 1))
 
         plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
-        plt.savefig(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s.pdf" % (DATASET, method.lower().replace(" ", ""))))
+        plt.savefig(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s_%s.pdf" %
+            (DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))
 
     def add_subplot(fig, score, attrs, i=1):
         graph = data.preprocess.scaling.ScaleScatterPlotData()
-        graph.setData(build_map.data(), graph.rawSubsetData)
+        graph.setData(original_data, graph.rawSubsetData)
         attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
         selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
         class_list = graph.original_data[graph.data_class_index]
         ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
                         horizontalalignment='center', verticalalignment='center')
 
-    scored = sorted(((ex["P"].value, ex["model"].value.attributes) for ex in table), reverse=True)
-    save_figure(scored[:6], "VizRank")
+    vr_models = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)
+
+    if original_data is not None and DATASET is not None and ROOT is not None:
+        save_figure(vr_models, "VizRank")
 
 
     # MODEL MAP
 
         return model.get_instance(table.domain)
 
+    table.random_generator = seed
     clustering.kmeans.data_center = data_center
-    kmeans = clustering.kmeans.Clustering(table, centroids=centroids, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
 
-    clusters = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+    if clusters is not None and linkage:
+        clustering_type = "hierarchical %s" % str(linkage).lower()
+        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)
 
-    best_projs = []
-    for k, g in groupby(clusters, key=itemgetter(0)):
-        best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+        best_projs = []
+        for c in range(2, clusters + 1):
+            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)
 
-    best_projs.sort(key=itemgetter(0), reverse=True)
-    scored = [(score, table[key]["attributes"].value.split(", ")) for score, key in best_projs]
+            best_projs_tmp = []
+            for n, cluster in enumerate(topmost):
+                best_projs_tmp.append(max(((table[i]["P"].value, i) for i in cluster), key=itemgetter(0)))
 
-    save_figure(scored[:6], "Model Map")
+            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
+            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])
 
+            if len(best_projs) >= clusters:
+                break
 
-    # SAVE MODEL MAP
+    elif clusters is not None and iterative_clustering:
+        clustering_type = "kmeans iterative"
+        table.shuffle()
+        best_projs = []
+        for c in range(2, clusters + 1):
+            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
 
-    smx = build_map.build_model_matrix(models)
-    mm.save(os.path.join(ROOT, "_projections_", "scatterplots_%s" % DATASET), smx, table, build_map.data())
+            best_projs_tmp = []
+            for k, g in groupby(clusters_, key=itemgetter(0)):
+                best_projs_tmp.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+
+            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
+            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])
+
+            if len(best_projs) >= clusters:
+                break
+
+        best_projs = best_projs[:clusters]
+    else:
+        if clusters is None:
+            clustering_type = "community detection"
+            graph = network.Graph()
+            graph.add_nodes_from(range(smx.dim))
+            graph.set_items(table)
+            graph.nodes_iter()
+
+            k = int(math.sqrt(smx.dim))
+            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
+            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))
+
+            clusters = network.community.label_propagation(graph, iterations=10000, seed=seed)
+            clusters = sorted(((j,i) for i,j in clusters.iteritems()), key=itemgetter(0))
+
+        else:
+            clustering_type = "kmeans"
+            table.shuffle()
+            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
+            clusters = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))
+
+        best_projs = []
+        for k, g in groupby(clusters, key=itemgetter(0)):
+            best_projs.append(max(((table[i]["P"].value, i) for c, i in g), key=itemgetter(0)))
+
+        best_projs.sort(key=itemgetter(0), reverse=True)
+
+    mm_models = [table[key] for score, key in best_projs]
+
+    if original_data is not None and DATASET is not None and ROOT is not None:
+        save_figure(mm_models, "Model Map", clustering_type)
+
+    return vr_models, mm_models
 
 #scatterplots_in_vr_mm("zoo", 10)
-scatterplots_in_vr_mm("vehicle", 6)
+#scatterplots_in_vr_mm("vehicle", 6)
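With the direct calls above commented out, the refactored flow is presumably: build and cache the projections once, then load and analyze. A hedged sketch of that usage, reusing mm.load as it appears in distance_metric_comparisson.py and the file-naming scheme from build_scatterplots (the clusters=10 choice is illustrative):

    import os.path, sys
    import _modelmaps as mm

    DATASET = "zoo"

    # Build the scatterplot model map once; build_scatterplots returns
    # early if the cached .bz2 file already exists.
    build_scatterplots(DATASET, ROOT)

    # Reload the cached distance matrix, model table and data, then
    # rank models by VizRank score and by model-map clustering.
    smx, table, original_data = mm.load(os.path.join(ROOT, "_projections_",
        "scatterplots_%s_%s" % (DATASET, sys.platform)))
    vr_models, mm_models = scatterplots_in_vr_mm(smx, table, original_data,
        DATASET=DATASET, ROOT=ROOT, clusters=10)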