# Source
#
# orange-modelmaps / examples / projections / scatterplot.py

import os.path, math, sys
import numpy as np

import matplotlib.pyplot as plt

import _modelmaps as mm

from itertools import groupby
from operator import itemgetter
from Orange import clustering, data, distance, network, utils
from Orange.orng import orngVizRank as vr

# Root directory of the modelmaps results. Several machine-specific locations
# are listed; because the name is simply reassigned, the LAST uncommented
# assignment is the one in effect (here the Windows path), and the first
# assignment is overwritten before it is ever used.
# NOTE(review): both functions in this file take ROOT as a parameter, so this
# module-level value is presumably meant for callers/scripts -- confirm.
ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_scatterplots(DATASET, ROOT):
    """Build scatterplot projection models for a data set and save them.

    The result is stored under ``ROOT/_projections_`` with the base name
    ``scatterplots_<DATASET>_<sys.platform>``. If a file with that base name
    and a ``.bz2`` suffix already exists, nothing is done.

    The data file ``<DATASET>.tab`` is looked up first in
    ``utils.environ.dataset_install_dir`` and then in ``ROOT/tab``.

    Args:
        DATASET: name of the data set (file name without the ``.tab`` suffix).
        ROOT: root directory holding the ``tab`` and ``_projections_`` folders.

    Returns:
        None.

    Raises:
        IOError: if the ``.tab`` file is found in neither location.
    """
    out_base = os.path.join(ROOT, "_projections_", "scatterplots_%s_%s" % (DATASET, sys.platform))

    # NOTE(review): the existence check looks for "<base>.bz2" while mm.save is
    # given "<base>" -- presumably mm.save appends the suffix; confirm.
    # os.path.isfile() already implies existence, so no separate exists() test.
    if os.path.isfile(out_base + ".bz2"):
        return

    tab_name = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_name)
    if not os.path.isfile(fname):
        # Fall back to the project's own copy of the data set.
        fname = os.path.join(ROOT, "tab", tab_name)
        if not os.path.isfile(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)
    nfeatures = len(build_map.data_d.domain.features)

    # Number of unordered feature pairs, n * (n - 1) / 2. Floor division keeps
    # the count an integer on Python 3 as well (the product is always even).
    max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) // 2

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("get features...")
    features_scatterplot = sorted(mm.get_feature_subsets_scatterplot(build_map.data().domain, max_nfeatures_scatterplot))
    print("build %d models..." % len(features_scatterplot))
    models = [build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot]
    print("build model data...")
    table = build_map.build_model_data(models)

    smx = build_map.build_model_matrix(models)
    mm.save(out_base, smx, table, build_map.data())

def scatterplots_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
    """Pick the best scatterplot projections by score and by model-map clustering.

    Two rankings are produced from ``table``:

    * "VizRank": all rows of ``table`` sorted by their ``"P"`` value,
      highest first.
    * "Model Map": the rows are grouped (see below) and the row with the
      highest ``"P"`` value of every group is kept.

    Grouping strategy, in order of precedence:

    * ``clusters`` given and ``linkage`` truthy: hierarchical clustering, cut
      into 2..``clusters`` top clusters.
    * ``clusters`` given and ``iterative_clustering`` truthy: k-means run for
      2..``clusters`` centroids; the result is truncated to ``clusters`` rows.
    * ``clusters`` is None: label propagation on a graph built from ``smx``.
    * otherwise: a single k-means run with ``clusters`` centroids.

    Side effects: ``table.random_generator`` is set to ``seed``,
    ``clustering.kmeans.data_center`` is replaced module-wide, ``table`` is
    shuffled in the k-means branches, and -- when ``original_data``,
    ``DATASET`` and ``ROOT`` are all given -- one PDF per ranking is written
    to ``ROOT/_projections_``.

    Args:
        smx: distance matrix of the models; only ``smx.dim`` and the matrix
            itself are used, and only in the label-propagation branch.
        table: rows with ``"P"``, ``"attributes"`` and ``"model"`` values.
        original_data: data set the projections are drawn from (figures only).
        DATASET: data set name used in figure titles and file names.
        ROOT: root directory; figures go to its ``_projections_`` folder.
        clusters: number of clusters, or None for label propagation.
        seed: seed for ``table.random_generator`` and label propagation.
        iterative_clustering: select the iterative k-means strategy.
        linkage: linkage for hierarchical clustering; selects that strategy.

    Returns:
        Tuple ``(vr_models, mm_models)`` of the two lists of table rows.
    """

    # Figures need the data to draw, a name for the title and a place to go.
    can_plot = original_data is not None and DATASET is not None and ROOT is not None

    # VIZRANK

    def save_figure(models, method, clustering_type=""):
        """Draw the first six models on a 3x2 grid and save the figure as PDF."""
        fig = plt.figure(figsize=(6, 9), dpi=300)
        try:
            fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)

            shown = models[:6]
            graph = None
            if shown:
                # Scale the data once per figure instead of once per subplot.
                graph = data.preprocess.scaling.ScaleScatterPlotData()
                graph.setData(original_data, graph.rawSubsetData)

            for i, model in enumerate(shown):
                add_subplot(fig, model["P"].value, model["attributes"].value.split(", "), i=(i + 1), graph=graph)

            # Address the figure explicitly rather than pyplot's "current figure".
            fig.text(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
            fig.savefig(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s_%s.pdf" %
                (DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this each call leaks one figure.
            plt.close(fig)

    def add_subplot(fig, score, attrs, i=1, graph=None):
        """Add the scatterplot of the two attributes ``attrs`` at grid slot ``i``."""
        if graph is None:
            graph = data.preprocess.scaling.ScaleScatterPlotData()
            graph.setData(original_data, graph.rawSubsetData)

        attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
        selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
        class_list = graph.original_data[graph.data_class_index]

        ax = fig.add_subplot(3, 2, i)

        # Axes with fewer than 10 distinct values get those values as tick
        # labels; otherwise only the font size of the default labels changes.
        # NOTE(review): the labels come from an unordered set and the tick
        # positions are not set to match -- confirm this is intended.
        x_dom = set(selected_data[0])
        if len(x_dom) < 10:
            ax.set_xticklabels(list(x_dom), size='x-small')
        else:
            for label in ax.get_xticklabels():
                label.set_fontsize('x-small')

        y_dom = set(selected_data[1])
        if len(y_dom) < 10:
            ax.set_yticklabels(list(y_dom), size='x-small')
        else:
            for label in ax.get_yticklabels():
                label.set_fontsize('x-small')

        ax.scatter(selected_data[0], selected_data[1], c=class_list, s=50., alpha=0.75)

        ax.set_xlabel(attrs[0], size='small')
        ax.set_ylabel(attrs[1], size='small')

        ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
                        horizontalalignment='center', verticalalignment='center')

    vr_models = sorted(table, key=lambda x: x["P"].value, reverse=True)

    if can_plot:
        save_figure(vr_models, "VizRank")


    # MODEL MAP

    class ModelDistanceConstructor(distance.DistanceConstructor):
        """Distance constructor that always yields a ``ModelDistance``."""

        def __new__(cls, data=None):
            self = distance.DistanceConstructor.__new__(cls)
            return self.__call__(data) if data else self

        def __call__(self, table):
            return ModelDistance()

    class ModelDistance(distance.Distance):
        """Manhattan distance between the ``"model"`` values of two rows."""

        def __call__(self, e1, e2):
            return mm.distance_manhattan(e1["model"].value, e2["model"].value)

    def data_center(table):
        """Return a row whose model holds the mean probabilities of ``table``."""
        onemodel = table[0]["model"].value
        # list(...) keeps the "first key" lookup working on Python 3, where
        # keys() is a view and cannot be indexed.
        first_class = list(onemodel.class_values.keys())[0]
        model = mm.Model("SCATTERPLOT", None,
            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
            onemodel.class_values, [],
            [first_class] * len(onemodel.instance_predictions),
            [first_class] * len(onemodel.instance_classes))

        return model.get_instance(table.domain)

    table.random_generator = seed
    # Module-wide replacement: k-means will use the centroid function above.
    clustering.kmeans.data_center = data_center

    if clusters is not None and linkage:
        clustering_type = "hierarchical %s" % str(linkage).lower()
        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)

        best_projs = []
        for c in range(2, clusters + 1):
            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)

            # Best-scoring row of every cluster, as (score, row index).
            best_projs_tmp = [max(((table[i]["P"].value, i) for i in cluster), key=itemgetter(0))
                              for cluster in topmost]

            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break

        # NOTE(review): unlike the iterative k-means branch below, the result
        # is not truncated to `clusters` entries here -- confirm intended.

    elif clusters is not None and iterative_clustering:
        clustering_type = "kmeans iterative"
        table.shuffle()
        best_projs = []
        for c in range(2, clusters + 1):
            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            # (cluster label, row index) pairs sorted by label, as groupby needs.
            clusters_ = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

            best_projs_tmp = []
            for k, g in groupby(clusters_, key=itemgetter(0)):
                best_projs_tmp.append(max(((table[i]["P"].value, i) for _, i in g), key=itemgetter(0)))

            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break

        best_projs = best_projs[:clusters]
    else:
        # `membership` holds (cluster label, row index) pairs sorted by label;
        # a separate name avoids rebinding the `clusters` parameter.
        if clusters is None:
            clustering_type = "community detection"
            graph = network.Graph()
            graph.add_nodes_from(range(smx.dim))
            graph.set_items(table)
            # NOTE(review): the result is discarded, so this looks like a
            # no-op -- kept as in the original, confirm before removing.
            graph.nodes_iter()

            k = int(math.sqrt(smx.dim))
            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))

            labels = network.community.label_propagation(graph, iterations=10000, seed=seed)
            # items() instead of the Python 2-only iteritems().
            membership = sorted(((j, i) for i, j in labels.items()), key=itemgetter(0))

        else:
            clustering_type = "kmeans"
            table.shuffle()
            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            membership = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

        best_projs = []
        for k, g in groupby(membership, key=itemgetter(0)):
            best_projs.append(max(((table[i]["P"].value, i) for _, i in g), key=itemgetter(0)))

        best_projs.sort(key=itemgetter(0), reverse=True)

    mm_models = [table[key] for score, key in best_projs]

    if can_plot:
        save_figure(mm_models, "Model Map", clustering_type)

    return vr_models, mm_models

#scatterplots_in_vr_mm("zoo", 10)
#scatterplots_in_vr_mm("vehicle", 6)