Source

orange-modelmaps / examples / projections / scatterplot.py

import os.path, math, sys
import numpy as np

import matplotlib.pyplot as plt

import orangecontrib.modelmaps as mm

from itertools import groupby
from operator import itemgetter
from Orange import clustering, data, distance, network, utils
from Orange.orng import orngVizRank as vr

# Machine-specific project root directories. The assignments run in order, so
# the last uncommented one (the Windows path) is the value ROOT ends up with;
# the first assignment is overwritten and the middle one is disabled.
# NOTE(review): nothing in the visible code reads this module-level ROOT --
# the functions below take ROOT as a parameter; presumably callers pass it in.
ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_scatterplots(DATASET, ROOT):
    """Build scatterplot projection models for one dataset and save them.

    Returns immediately when the ``.bz2`` output for this dataset and
    platform already exists under ``ROOT/_projections_``. Otherwise the
    dataset's ``.tab`` file is looked up in Orange's dataset directory first
    and in ``ROOT/tab`` second, one projection model is built per feature
    subset returned by ``mm.get_feature_subsets_scatterplot``, and the result
    of ``build_model_matrix``, the model table and the original data are
    passed to ``mm.save``.

    Args:
        DATASET: Dataset name without the ``.tab`` extension.
        ROOT: Directory containing the ``tab`` input folder and the
            ``_projections_`` output folder.

    Returns:
        None.

    Raises:
        IOError: If the ``.tab`` file is found in neither location.
    """
    # mm.save receives the path without an extension while the cache check
    # looks for ".bz2" -- presumably mm.save appends it; TODO confirm.
    out_base = os.path.join(ROOT, "_projections_", "scatterplots_%s_%s" % (DATASET, sys.platform))
    if os.path.isfile(out_base + ".bz2"):
        return

    tab_name = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_name)
    if not os.path.isfile(fname):
        # Fall back to the project's own copy of the dataset.
        fname = os.path.join(ROOT, "tab", tab_name)
        if not os.path.isfile(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)
    nfeatures = len(build_map.data_d.domain.features)

    # n * (n - 1) / 2, the number of unordered feature pairs. Floor division
    # keeps the value an int on Python 3 as well as Python 2.
    max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) // 2

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("get features...")
    features_scatterplot = sorted(mm.get_feature_subsets_scatterplot(build_map.data().domain, max_nfeatures_scatterplot))
    print("build %d models..." % len(features_scatterplot))
    models = [build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot]
    print("build model data...")
    table = build_map.build_model_data(models)

    smx = build_map.build_model_matrix(models)
    mm.save(out_base, smx, table, build_map.data())

def scatterplots_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
    """Select the best scatterplots by VizRank score and by model-map clustering.

    VizRank: all rows of ``table`` sorted by their "P" value, descending.

    Model map: the rows are clustered and the highest-"P" row of each cluster
    is kept. The clustering method depends on the arguments:

    * ``clusters`` given and ``linkage`` truthy -- hierarchical clustering,
      cut into 2..``clusters`` top clusters.
    * ``clusters`` given and ``iterative_clustering`` truthy -- k-means run
      for k = 2..``clusters``.
    * ``clusters`` is None -- label propagation on a graph built from ``smx``.
    * otherwise -- a single k-means run with k = ``clusters``.

    When ``original_data``, ``DATASET`` and ``ROOT`` are all given, the first
    six models of each ranking are also drawn and saved as PDF files under
    ``ROOT/_projections_``.

    Side effects: sets ``table.random_generator``, shuffles ``table`` in the
    k-means branches, and replaces ``clustering.kmeans.data_center`` with a
    model-aware implementation for the rest of the process.

    Args:
        smx: Object with a ``dim`` attribute that is passed to
            ``edges_from_distance_matrix``; only read in the label-propagation
            branch. Presumably the model distance matrix -- TODO confirm.
        table: Indexable, iterable collection of rows exposing "P",
            "attributes" (names joined by ", ") and "model" values.
        original_data: Data set to plot; only used for the figures.
        DATASET: Dataset name used in figure titles and file names.
        ROOT: Directory containing the ``_projections_`` output folder.
        clusters: Requested number of clusters, or None for label propagation.
        seed: Value assigned to ``table.random_generator`` and passed to
            label propagation.
        iterative_clustering: Select the iterative k-means branch.
        linkage: Linkage passed to hierarchical clustering; selects that branch.

    Returns:
        Tuple ``(vr_models, mm_models)`` of lists of rows from ``table``.
    """

    # VIZRANK

    def save_figure(models, method, clustering_type=""):
        """Draw the first six models on a 3x2 grid and save the page as a PDF."""
        fig = plt.figure(figsize=(6, 9), dpi=300)
        try:
            fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.9, bottom=0.05, left=0.1, right=0.95)

            for i, model in enumerate(models[:6]):
                add_subplot(fig, model["P"].value, model["attributes"].value.split(", "), i=(i + 1))

            fig.text(0.5, 0.965, r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
            fig.savefig(os.path.join(ROOT, "_projections_", "scatterplots_%s_%s_%s.pdf" %
                (DATASET, method.lower().replace(" ", ""), clustering_type.replace(" ", "_"))))
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this each call would leave a 300-dpi figure in memory.
            plt.close(fig)

    def add_subplot(fig, score, attrs, i=1):
        """Add the scatterplot of the two attributes `attrs` at grid position `i`."""
        graph = data.preprocess.scaling.ScaleScatterPlotData()
        graph.jitter_continuous = True

        graph.setData(original_data, graph.rawSubsetData)
        graph.rescaleData()

        attr_indices = [graph.attribute_name_index[attr] for attr in attrs]
        selected_data = np.take(graph.scaled_data, attr_indices, axis=0)
        class_list = graph.original_data[graph.data_class_index]

        ax = fig.add_subplot(3, 2, i)

        # NOTE(review): for axes with fewer than 10 distinct values the tick
        # labels are taken from an unordered set and tick positions are never
        # set, so labels may not line up with the data -- confirm intent.
        x_dom = set(selected_data[0])
        if len(x_dom) < 10:
            ax.set_xticklabels(list(x_dom), size='x-small')
        else:
            for label in ax.get_xticklabels():
                label.set_fontsize('x-small')

        y_dom = set(selected_data[1])
        if len(y_dom) < 10:
            ax.set_yticklabels(list(y_dom), size='x-small')
        else:
            for label in ax.get_yticklabels():
                label.set_fontsize('x-small')

        ax.scatter(selected_data[0], selected_data[1], c=class_list, s=50., alpha=0.75)

        ax.set_xlabel(attrs[0], size='small')
        ax.set_ylabel(attrs[1], size='small')

        ax.set_title(r"$\overline{P}=%.2f$" % (score*100), weight='bold', size='medium', position=(0.5, 1.1),
                        horizontalalignment='center', verticalalignment='center')

    # Figures are only produced when everything needed to draw and name them is given.
    can_plot = original_data is not None and DATASET is not None and ROOT is not None

    vr_models = sorted(table, key=lambda x: x["P"].value, reverse=True)

    if can_plot:
        save_figure(vr_models, "VizRank")


    # MODEL MAP

    class ModelDistanceConstructor(distance.DistanceConstructor):
        """Distance constructor that always yields a ModelDistance."""

        def __new__(cls, data=None):
            self = distance.DistanceConstructor.__new__(cls)
            return self.__call__(data) if data else self

        def __call__(self, table):
            return ModelDistance()

    class ModelDistance(distance.Distance):
        """Manhattan distance between the "model" values of two rows."""

        def __call__(self, e1, e2):
            return mm.distance_manhattan(e1["model"].value, e2["model"].value)

    def data_center(table):
        """Build a row whose model carries the mean probabilities of all models in `table`."""
        onemodel = table[0]["model"].value
        # list(...) keeps this working whether keys() returns a list or a view.
        first_class = list(onemodel.class_values.keys())[0]
        model = mm.Model("SCATTERPLOT", None,
            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
            onemodel.class_values, [],
            [first_class] * len(onemodel.instance_predictions),
            [first_class] * len(onemodel.instance_classes))

        return model.get_instance(table.domain)

    def best_of(indices):
        """Return (score, index) of the highest-"P" row among `indices`."""
        return max(((table[i]["P"].value, i) for i in indices), key=itemgetter(0))

    def best_per_label(pairs):
        """Return the best (score, index) per label, highest score first.

        `pairs` is a list of (label, row index) tuples sorted by label.
        """
        best = [best_of(i for _, i in group) for _, group in groupby(pairs, key=itemgetter(0))]
        best.sort(key=itemgetter(0), reverse=True)
        return best

    table.random_generator = seed
    # Replaces the module-level function, so it stays in effect after this call returns.
    clustering.kmeans.data_center = data_center

    if clusters is not None and linkage:
        clustering_type = "hierarchical %s" % str(linkage).lower()
        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)

        best_projs = []
        for c in range(2, clusters + 1):
            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)

            best_projs_tmp = [best_of(cluster) for cluster in topmost]
            best_projs_tmp.sort(key=itemgetter(0), reverse=True)
            # A list (not a generator) so membership is tested against the
            # projections collected before this round.
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break

        # A round can add several projections at once and overshoot the
        # requested count; cap it as the iterative k-means branch does.
        best_projs = best_projs[:clusters]

    elif clusters is not None and iterative_clustering:
        clustering_type = "kmeans iterative"
        table.shuffle()
        best_projs = []
        for c in range(2, clusters + 1):
            kmeans = clustering.kmeans.Clustering(table, centroids=c, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            assignments = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

            best_projs_tmp = best_per_label(assignments)
            best_projs.extend([best_proj for best_proj in best_projs_tmp if best_proj not in best_projs])

            if len(best_projs) >= clusters:
                break

        best_projs = best_projs[:clusters]
    else:
        if clusters is None:
            clustering_type = "community detection"
            graph = network.Graph()
            graph.add_nodes_from(range(smx.dim))
            graph.set_items(table)
            # NOTE(review): the returned iterator is discarded; kept only to
            # avoid changing behavior in case the call has a side effect.
            graph.nodes_iter()

            k = int(math.sqrt(smx.dim))
            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))

            labels = network.community.label_propagation(graph, iterations=10000, seed=seed)
            # Flip the node -> label mapping into (label, node) pairs grouped by label.
            assignments = sorted(((j, i) for i, j in labels.items()), key=itemgetter(0))

        else:
            clustering_type = "kmeans"
            table.shuffle()
            kmeans = clustering.kmeans.Clustering(table, centroids=clusters, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
            assignments = sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

        best_projs = best_per_label(assignments)

    mm_models = [table[key] for score, key in best_projs]

    if can_plot:
        save_figure(mm_models, "Model Map", clustering_type)

    return vr_models, mm_models

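# NOTE: these disabled example calls pass a dataset name and a cluster count,
# which does not match the signature scatterplots_in_vr_mm(smx, table, ...).
#scatterplots_in_vr_mm("zoo", 10)
#scatterplots_in_vr_mm("vehicle", 6)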
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.