1. Miha Stajdohar
  2. orange-modelmaps

Source

orange-modelmaps / examples / projections / radviz.py

Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 
Aleš Erjavec 4f4446f 
Miha Stajdohar e49e39e 


Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 




Miha Stajdohar 67ef2ae 



Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 


Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 























Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 







Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 





Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 
Miha Stajdohar c6b442c 
Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 



Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 
Miha Stajdohar c6b442c 

Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 

Miha Stajdohar 67ef2ae 



Miha Stajdohar e49e39e 



Miha Stajdohar 67ef2ae 
Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 


Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 

Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 


Miha Stajdohar e49e39e 
Miha Stajdohar 67ef2ae 


















































































import os.path, math, sys, random, itertools
import numpy as np

import matplotlib.pyplot as plt

import orangecontrib.modelmaps as mm

from itertools import groupby
from operator import itemgetter
from Orange import clustering, data, distance, network, utils
from Orange.orng import orngVizRank as vr
from matplotlib.patches import Circle

# Machine-specific base directory (presumably where the model map resources
# and results live -- confirm against callers). ROOT is assigned twice; the
# later assignment wins, so the Windows path below is the effective
# module-level value and the Linux path is overridden. The functions in this
# file take their own ROOT parameter, which shadows this global inside them.
ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_radviz(DATASET, ROOT, n_attributes):
    """Build Radviz projection models for a data set and save them under ROOT.

    Nothing is done when the result archive
    ``ROOT/_projections_/radviz_<n>_<DATASET>_<platform>.bz2`` already exists.

    The ``<DATASET>.tab`` file is looked up first in Orange's dataset install
    directory and then in ``ROOT/tab``.

    :param DATASET: data set name without the ``.tab`` extension.
    :param ROOT: base directory holding ``tab`` and ``_projections_``.
    :param n_attributes: number of features per projection; 3 or 4.
    :raises IOError: if the data file is found in neither location.
    :raises AttributeError: if ``n_attributes`` is neither 3 nor 4.
    """
    def _is_file(path):
        # os.path.isfile() is already False for paths that do not exist.
        return os.path.isfile(path)

    archive = os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s.bz2" % (n_attributes, DATASET, sys.platform))
    if _is_file(archive):
        return

    tab_name = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_name)
    if not _is_file(fname):
        fname = os.path.join(ROOT, "tab", tab_name)
        if not _is_file(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)

    print("get features...")
    if n_attributes == 3:
        features = sorted(mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3))
    elif n_attributes == 4:
        features = mm.get_feature_subsets(build_map.data().domain, min_features=4, max_features=4)

        total = len(features)
        if total > 3000:
            # Keep a random sample of exactly 3000 subsets, preserving order.
            keep_mask = [1] * 3000 + [0] * (total - 3000)
            random.shuffle(keep_mask)
            features = list(itertools.compress(features, keep_mask))

        # For every subset add two more orderings of the same four features.
        reordered = []
        for subset in features:
            reordered.append([subset[1], subset[0], subset[2], subset[3]])
            reordered.append([subset[0], subset[2], subset[1], subset[3]])
        features.extend(reordered)
    else:
        raise AttributeError("Only Radviz on 3 or 4 features supported.")

    features.sort()
    print("build %d models..." % len(features))
    models = []
    for attrs in features:
        models.append(build_map.build_projection_model(attrs, vr.RADVIZ))
    print("build model data...")
    table = build_map.build_model_data(models)

    smx = build_map.build_model_matrix(models)
    # The target name carries no extension; the early-exit check above looks
    # for the same name with ".bz2" appended.
    target = os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s" % (n_attributes, DATASET, sys.platform))
    mm.save(target, smx, table, build_map.data())

def radviz_in_vr_mm_4(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
    """Call radviz_in_vr_mm with n_attributes fixed to 4.

    All other arguments are forwarded unchanged; the return value is that of
    radviz_in_vr_mm.
    """
    return radviz_in_vr_mm(
        smx,
        table,
        original_data=original_data,
        DATASET=DATASET,
        ROOT=ROOT,
        clusters=clusters,
        seed=seed,
        n_attributes=4,
        iterative_clustering=iterative_clustering,
        linkage=linkage,
    )

def radviz_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, n_attributes=3, iterative_clustering=False, linkage=None):
    """Select Radviz projections by VizRank score and by model-map clustering.

    Two selections are built from the model instances in ``table``:

    * VizRank: every instance, ordered by its "P" value, best first.
    * Model map: the instances are clustered and the instance with the
      highest "P" value is picked from each cluster, best first.

    The clustering used for the model map depends on the arguments:

    * ``clusters`` given and ``linkage`` truthy: hierarchical clustering,
      cut into 2..``clusters`` top clusters until enough picks are collected.
    * ``clusters`` given and ``iterative_clustering`` truthy: k-means for
      k = 2..``clusters``; the picks are truncated to ``clusters`` entries.
    * ``clusters`` is None: label propagation on a graph whose edges are
      derived from ``smx``.
    * otherwise: a single k-means run with ``clusters`` centroids.

    When both ``DATASET`` and ``ROOT`` are given, the first six instances of
    each selection are plotted and saved as a PDF in ``ROOT/_projections_``.

    Side effects: sets ``table.random_generator`` to ``seed``, replaces
    ``clustering.kmeans.data_center`` with a model-aware version, and the
    k-means branches call ``table.shuffle()``.

    :param smx: model distance matrix (its ``dim`` attribute is read).
    :param table: table of model instances with "P", "model" and
        "attributes" values.
    :param original_data: data the projections are drawn from; only used for
        plotting.
    :param DATASET: data set name used in figure titles and file names.
    :param ROOT: base directory for saved figures.
    :param clusters: number of clusters, or None for community detection.
    :param seed: seed for the table's random generator and label propagation.
    :param n_attributes: number of features per projection; only used in the
        figure file name.
    :param iterative_clustering: use the iterative k-means variant.
    :param linkage: hierarchical clustering linkage; selects the hierarchical
        variant when truthy.
    :return: tuple ``(vr_models, mm_models)`` of instance lists.
    """

    # VIZRANK

    def save_figure(model_instances, method, clustering_type=""):
        """Plot the first six instances on a 3x2 grid and save them as a PDF."""
        fig = plt.figure(figsize=(6, 9), dpi=300)
        fig.subplots_adjust(wspace=0.3, hspace=0.2, top=0.9, bottom=0.01, left=0, right=0.90)

        for i, inst in enumerate(model_instances[:6]):
            add_subplot(fig, inst, i=(i + 1))

        plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
        # File name: radviz_<n>_<dataset>_<method without spaces>_<clustering type with underscores>.pdf
        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s_%s.pdf" %
            (n_attributes, DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))

    def add_subplot(fig, model_instance, i=1):
        """Draw the Radviz projection of one model instance into subplot ``i``."""
        graph = data.preprocess.scaling.ScaleLinProjData()
        graph.normalizeExamples = 1
        graph.setData(original_data, graph.rawSubsetData)

        # The "attributes" value is a ", "-separated list of feature names.
        attr_names = model_instance["attributes"].value.split(", ")
        attr_indices = [graph.attribute_name_index[attr] for attr in attr_names]
        class_list = graph.original_data[graph.data_class_index]

        validData = graph.getValidList(attr_indices)
        transProjData = graph.createProjectionAsNumericArray(attr_indices, validData=validData, normalize=graph.normalizeExamples, jitterSize=0.1, useAnchorData=1, removeMissingData=0)
        projData = transProjData.T

        ax = fig.add_subplot(3, 2, i)

        ax.scatter(projData[0], projData[1], c=class_list, s=50., alpha=0.75)

        xanchors = graph.create_xanchors(len(attr_indices))
        yanchors = graph.create_yanchors(len(attr_indices))

        # Unit circle with the feature anchors and their labels.
        ax.add_artist(Circle((0., 0.), radius=1, edgecolor="black", facecolor="none", alpha=0.75))
        ax.scatter(xanchors, yanchors, c="black", s=50., alpha=0.75)
        for x, y, attr in zip(xanchors, yanchors, attr_names):
            ax.text(x*1.06, y*1.18-0.04, attr, size="x-small")

        ax.set_axis_off()
        ax.set_xlim(-1.2, 1.2)
        ax.set_ylim(-1.2, 1.2)

        ax.set_title(r"$\overline{P}=%.2f$" % (model_instance["P"].value * 100), weight='bold', size='medium', position=(0.5,   1.01),
                        horizontalalignment='center', verticalalignment='center')

    vr_models = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)

    if DATASET is not None and ROOT is not None:
        save_figure(vr_models, "VizRank")

    # MODEL MAP

    class ModelDistanceConstructor(distance.DistanceConstructor):
        """Distance constructor that always yields a ModelDistance."""

        def __new__(cls, data=None):
            self = distance.DistanceConstructor.__new__(cls)
            # Constructing with data returns the distance directly.
            return self.__call__(data) if data else self

        def __call__(self, table):
            return ModelDistance()

    class ModelDistance(distance.Distance):
        """Manhattan distance between the models stored in two instances."""

        def __call__(self, e1, e2):
            return mm.distance_manhattan(e1["model"].value, e2["model"].value)

    def data_center(table):
        """Return an instance whose model holds the mean probabilities of ``table``.

        The probabilities are averaged element-wise over all models; class
        values are copied from the first model and every prediction and class
        slot is filled with that model's first class value key.
        """
        onemodel = table[0]["model"].value
        first_class = onemodel.class_values.keys()[0]
        model = mm.Model("RADVIZ", None,
            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
            onemodel.class_values, [],
            [first_class] * len(onemodel.instance_predictions),
            [first_class] * len(onemodel.instance_classes))

        return model.get_instance(table.domain)

    table.random_generator = seed
    # k-means must average models rather than raw attribute values.
    clustering.kmeans.data_center = data_center

    if clusters is not None and linkage:
        clustering_type = "hierarchical %s" % str(linkage).lower()
        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)

        best_projs = []
        for c in range(2, clusters + 1):
            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)

            # Best-scoring (P, index) pair of every cluster at this cut.
            best_projs_tmp = []
            for cluster in topmost:
                best_projs_tmp.append(max(((table[i]["P"].value, i) for i in cluster), key=itemgetter(0)))

            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break
        # NOTE(review): unlike the iterative k-means branch, the result is not
        # truncated to `clusters` entries here -- confirm this is intended.

    elif clusters is not None and iterative_clustering:
        clustering_type = "kmeans iterative"
        table.shuffle()
        best_projs = []
        for c in range(2, clusters + 1):
            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            # (cluster label, instance index) pairs sorted by label for groupby.
            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

            best_projs_tmp = []
            for k, g in groupby(clusters_, key=itemgetter(0)):
                best_projs_tmp.append(max(((table[i]["P"].value, i) for _, i in g), key=itemgetter(0)))

            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break

        best_projs = best_projs[:clusters]
    else:
        if clusters is None:
            clustering_type = "community detection"
            graph = network.Graph()
            graph.add_nodes_from(range(smx.dim))
            graph.set_items(table)
            # NOTE(review): the result of nodes_iter() is discarded; kept as in
            # the original in case the call matters -- confirm and remove.
            graph.nodes_iter()

            k = int(math.sqrt(smx.dim))
            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
            # Turn distances into similarities for the edge weights.
            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))

            clusters_ = network.community.label_propagation(graph, iterations=10000, seed=seed)
            print("Clusters: %d" % len(set(clusters_.values())))
            # Fix: iterate the label-propagation result. The original read
            # `clusters.iteritems()`, but `clusters` is always None in this
            # branch, so the call raised AttributeError.
            clusters_ = sorted(((j, i) for i, j in clusters_.iteritems()), key=itemgetter(0))

        else:
            clustering_type = "kmeans"
            table.shuffle()
            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

        # clusters_ holds (cluster label, instance index) pairs sorted by label.
        best_projs = []
        for k, g in groupby(clusters_, key=itemgetter(0)):
            best_projs.append(max(((table[i]["P"].value, i) for _, i in g), key=itemgetter(0)))

        best_projs.sort(key=itemgetter(0), reverse=True)

    mm_models = [table[key] for score, key in best_projs]

    if DATASET is not None and ROOT is not None:
        save_figure(mm_models, "Model Map", clustering_type)

    return vr_models, mm_models