Source

orange-modelmaps / examples / projections / radviz.py

Full commit
import os.path, math, sys, random, itertools
import numpy as np

import matplotlib.pyplot as plt

import _modelmaps as mm

from itertools import groupby
from operator import itemgetter
from Orange import clustering, data, distance, network, utils
from Orange.orng import orngVizRank as vr
from matplotlib.patches import Circle

# Base directory of the modelmaps working tree (presumably one path per
# development machine -- confirm).
# NOTE(review): only the last assignment takes effect; the first value is
# overwritten immediately by the Windows path below.
ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_radviz(DATASET, ROOT, n_attributes):
    """Build RadViz projection models for a dataset and save them.

    Does nothing when the output archive
    ``<ROOT>/_projections_/radviz_<n>_<DATASET>_<platform>.bz2`` already
    exists. Otherwise the dataset ``<DATASET>.tab`` is looked up first in
    Orange's dataset install directory and then in ``<ROOT>/tab``; one
    RadViz model is built per feature subset and the model matrix, model
    table and original data are handed to ``mm.save``.

    Arguments:
        DATASET      -- dataset name without the ``.tab`` extension.
        ROOT         -- base directory holding ``tab`` and ``_projections_``.
        n_attributes -- number of features per projection; must be 3 or 4.

    Raises:
        AttributeError -- if ``n_attributes`` is neither 3 nor 4.
        IOError        -- if the dataset file is found in neither location.
    """
    # Path without extension; presumably mm.save appends ".bz2" itself,
    # since the cache check looks for that suffix -- TODO confirm.
    out_stem = os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s" % (n_attributes, DATASET, sys.platform))
    if os.path.isfile(out_stem + ".bz2"):
        return

    # Fail fast: reject unsupported sizes before locating and loading the
    # dataset instead of after the model map has already been constructed.
    if n_attributes not in (3, 4):
        raise AttributeError("Only Radviz on 3 or 4 features supported.")

    tab_name = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_name)
    if not os.path.isfile(fname):
        fname = os.path.join(ROOT, "tab", tab_name)
        if not os.path.isfile(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)

    print("get features...")
    if n_attributes == 3:
        features = sorted(mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3))
    else:
        features = mm.get_feature_subsets(build_map.data().domain, min_features=4, max_features=4)

        if len(features) > 3000:
            # Keep a random sample of exactly 3000 subsets while preserving
            # their original relative order.
            selectors = [1] * 3000 + [0] * (len(features) - 3000)
            random.shuffle(selectors)
            features = list(itertools.compress(features, selectors))

        # For every sampled subset also add two reorderings (first two
        # features swapped, middle two features swapped).
        reordered = []
        for f in features:
            reordered.append([f[1], f[0], f[2], f[3]])
            reordered.append([f[0], f[2], f[1], f[3]])
        features.extend(reordered)

    features.sort()
    print("build %d models..." % len(features))
    models = [build_map.build_projection_model(attrs, vr.RADVIZ) for attrs in features]
    print("build model data...")
    table = build_map.build_model_data(models)

    smx = build_map.build_model_matrix(models)
    mm.save(out_stem, smx, table, build_map.data())

def radviz_in_vr_mm_4(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
    """Call radviz_in_vr_mm with n_attributes fixed to 4.

    All other arguments are forwarded unchanged and the result of
    radviz_in_vr_mm is returned as is.
    """
    forwarded = dict(
        original_data=original_data,
        DATASET=DATASET,
        ROOT=ROOT,
        clusters=clusters,
        seed=seed,
        n_attributes=4,
        iterative_clustering=iterative_clustering,
        linkage=linkage,
    )
    return radviz_in_vr_mm(smx, table, **forwarded)

def radviz_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, n_attributes=3, iterative_clustering=False, linkage=None):
    """Select RadViz projections by VizRank score and by model-map clustering.

    Arguments:
        smx           -- model distance matrix; only read (``smx.dim`` and the
                         edge construction) when ``clusters`` is None.
        table         -- model instances; each must provide the "P",
                         "attributes" and "model" fields read below.
        original_data -- data passed to the RadViz scaler when drawing
                         figures; only used if figures are saved.
        DATASET, ROOT -- when both are not None, a PDF with up to six
                         projections is written to ``<ROOT>/_projections_``
                         for each of the two selections.
        clusters      -- number of clusters / representatives; None selects
                         label-propagation community detection instead.
        seed          -- assigned to ``table.random_generator`` and passed to
                         label propagation.
        n_attributes  -- only used in the figure file names.
        iterative_clustering -- with ``clusters`` set (and no ``linkage``),
                         run k-means for k = 2..clusters and accumulate the
                         representatives.
        linkage       -- with ``clusters`` set, use hierarchical clustering
                         with this linkage.

    Returns:
        (vr_models, mm_models): all instances sorted by descending "P", and
        the best-"P" instance of each cluster sorted by descending "P".

    Side effects: replaces ``clustering.kmeans.data_center`` module-wide,
    sets ``table.random_generator`` and shuffles ``table`` in the k-means
    branches.
    """

    save_figures = DATASET is not None and ROOT is not None

    # VIZRANK

    def save_figure(model_instances, method, clustering_type=""):
        """Plot at most the first six projections on a 3x2 grid and save a PDF."""
        fig = plt.figure(figsize=(6, 9), dpi=300)
        fig.subplots_adjust(wspace=0.3, hspace=0.2, top=0.9, bottom=0.01, left=0, right=0.90)

        for i, inst in enumerate(model_instances[:6]):
            add_subplot(fig, inst, i=(i + 1))

        plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s_%s.pdf" %
            (n_attributes, DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))
        # Release the figure; otherwise every call leaves a 300 dpi figure
        # registered with pyplot for the lifetime of the process.
        plt.close(fig)

    def add_subplot(fig, model_instance, i=1):
        """Draw one RadViz projection into cell ``i`` of the 3x2 grid."""
        graph = data.preprocess.scaling.ScaleLinProjData()
        graph.normalizeExamples = 1
        graph.setData(original_data, graph.rawSubsetData)

        attr_names = model_instance["attributes"].value.split(", ")
        attr_indices = [graph.attribute_name_index[attr] for attr in attr_names]
        class_list = graph.original_data[graph.data_class_index]

        validData = graph.getValidList(attr_indices)
        transProjData = graph.createProjectionAsNumericArray(attr_indices, validData=validData, normalize=graph.normalizeExamples, jitterSize=-1, useAnchorData=1, removeMissingData=0)
        projData = transProjData.T

        ax = fig.add_subplot(3, 2, i)

        ax.scatter(projData[0], projData[1], c=class_list, s=50., alpha=0.75)

        xanchors = graph.create_xanchors(len(attr_indices))
        yanchors = graph.create_yanchors(len(attr_indices))

        ax.add_artist(Circle((0., 0.), radius=1, edgecolor="black", facecolor="none", alpha=0.75))
        ax.scatter(xanchors, yanchors, c="black", s=50., alpha=0.75)
        for x, y, attr in zip(xanchors, yanchors, attr_names):
            # Push the labels slightly outside the unit circle.
            ax.text(x*1.06, y*1.18-0.04, attr, size="x-small")

        ax.set_axis_off()
        ax.set_xlim(-1.1, 1.1)
        ax.set_ylim(-1.1, 1.1)

        ax.set_title(r"$\overline{P}=%.2f$" % (model_instance["P"].value * 100), weight='bold', size='medium', position=(0.5,   1.01),
                        horizontalalignment='center', verticalalignment='center')

    vr_models = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)

    if save_figures:
        save_figure(vr_models, "VizRank")


    # MODEL MAP

    class ModelDistanceConstructor(distance.DistanceConstructor):

        def __new__(cls, data=None):
            self = distance.DistanceConstructor.__new__(cls)
            return self.__call__(data) if data else self

        def __call__(self, table):
            return ModelDistance()

    class ModelDistance(distance.Distance):
        def __call__(self, e1, e2):
            return mm.distance_manhattan(e1["model"].value, e2["model"].value)

    def data_center(table):
        """Return an instance whose model holds the mean class probabilities."""
        onemodel = table[0]["model"].value
        first_class = onemodel.class_values.keys()[0]
        model = mm.Model("RADVIZ", None,
            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
            onemodel.class_values, [],
            [first_class] * len(onemodel.instance_predictions),
            [first_class] * len(onemodel.instance_classes))

        return model.get_instance(table.domain)

    def best_in(indices):
        """Return (P, index) of the highest-scoring model among ``indices``.

        max() keeps the first maximal element, so ties go to the index that
        comes first in iteration order.
        """
        return max(((table[i]["P"].value, i) for i in indices), key=itemgetter(0))

    def best_per_label(labelled):
        """Return the best (P, index) for each run of equal labels.

        ``labelled`` is a list of (label, index) pairs sorted by label.
        """
        return [best_in(i for _, i in group) for _, group in groupby(labelled, key=itemgetter(0))]

    def labelled_by_cluster(labels):
        """Pair each position with its cluster label, sorted by label."""
        return sorted(zip(labels, range(len(labels))), key=itemgetter(0))

    def add_new(collected, candidates):
        """Append candidates not collected yet, best score first."""
        candidates.sort(key=itemgetter(0), reverse=True)
        collected.extend([cand for cand in candidates if cand not in collected])

    table.random_generator = seed
    # k-means has to average project-specific model objects, so its
    # centroid function is replaced (module-wide) with the one above.
    clustering.kmeans.data_center = data_center

    if clusters is not None and linkage:
        clustering_type = "hierarchical %s" % str(linkage).lower()
        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)

        best_projs = []
        for c in range(2, clusters + 1):
            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)
            add_new(best_projs, [best_in(cluster) for cluster in topmost])

            if len(best_projs) >= clusters:
                break
        # NOTE(review): unlike the iterative k-means branch, the result is
        # not truncated to `clusters` entries here -- confirm this is intended.

    elif clusters is not None and iterative_clustering:
        clustering_type = "kmeans iterative"
        table.shuffle()
        best_projs = []
        for c in range(2, clusters + 1):
            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            add_new(best_projs, best_per_label(labelled_by_cluster(kmeans.clusters)))

            if len(best_projs) >= clusters:
                break

        best_projs = best_projs[:clusters]
    else:
        if clusters is None:
            clustering_type = "community detection"
            graph = network.Graph()
            graph.add_nodes_from(range(smx.dim))
            graph.set_items(table)
            # NOTE(review): the returned iterator is discarded; kept only to
            # leave the call sequence on the graph unchanged.
            graph.nodes_iter()

            k = int(math.sqrt(smx.dim))
            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))

            clusters_ = network.community.label_propagation(graph, iterations=10000, seed=seed)
            print("Clusters: %d" % len(set(clusters_.values())))
            # Fix: iterate the node -> label mapping returned above. The
            # previous code read `clusters`, which is always None in this
            # branch, and therefore raised AttributeError.
            clusters_ = sorted(((j,i) for i,j in clusters_.iteritems()), key=itemgetter(0))

        else:
            clustering_type = "kmeans"
            table.shuffle()
            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            clusters_ = labelled_by_cluster(kmeans.clusters)

        best_projs = best_per_label(clusters_)
        best_projs.sort(key=itemgetter(0), reverse=True)

    mm_models = [table[key] for score, key in best_projs]

    if save_figures:
        save_figure(mm_models, "Model Map", clustering_type)

    return vr_models, mm_models