orange-modelmaps / examples / projections / radviz.py

import os.path, math, sys, random, itertools
import numpy as np

import matplotlib.pyplot as plt

import orangecontrib.modelmaps as mm

from itertools import groupby
from operator import itemgetter
from Orange import clustering, data, distance, network, utils
from Orange.orng import orngVizRank as vr
from matplotlib.patches import Circle

# Root directory of the model-map resources.  build_radviz() and
# radviz_in_vr_mm() read/write the "_projections_" sub-directory below the
# ROOT they are given, and build_radviz() falls back to ROOT/tab for data sets.
# Only one root is active; alternatives for other machines are kept commented.
#ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_radviz(DATASET, ROOT, n_attributes):
    """Build RadViz projection models for a data set and save them under ROOT.

    Nothing is done when the output archive
    ``ROOT/_projections_/radviz_<n_attributes>_<DATASET>_<sys.platform>.bz2``
    already exists as a regular file.

    The data set ``<DATASET>.tab`` is looked up first in Orange's dataset
    install directory and then in ``ROOT/tab``.

    :param DATASET: data set name without the ``.tab`` extension.
    :param ROOT: root directory holding the ``tab`` and ``_projections_``
        sub-directories.
    :param n_attributes: number of attributes per projection; 3 or 4.
    :raises IOError: if the data file is found in neither location.
    :raises AttributeError: if ``n_attributes`` is neither 3 nor 4.
    """
    stem = os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s" % (n_attributes, DATASET, sys.platform))
    # The existence check looks for the stem plus ".bz2" -- presumably the
    # suffix mm.save() appends; verify against mm.save.
    if os.path.isfile(stem + ".bz2"):
        return

    tab_file = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_file)
    if not os.path.isfile(fname):
        # Not among the installed data sets: fall back to ROOT/tab.
        fname = os.path.join(ROOT, "tab", tab_file)
        if not os.path.isfile(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)

    print("get features...")
    if n_attributes == 3:
        features = sorted(mm.get_feature_subsets(build_map.data().domain, min_features=3, max_features=3))
    elif n_attributes == 4:
        features = mm.get_feature_subsets(build_map.data().domain, min_features=4, max_features=4)

        n_subsets = len(features)
        if n_subsets > 3000:
            # Keep a random sample of exactly 3000 subsets, in original order.
            keep = [1] * 3000 + [0] * (n_subsets - 3000)
            random.shuffle(keep)
            features = [subset for subset, flag in zip(features, keep) if flag]

        # For every subset add two further orderings of the same four
        # attributes.  Iterate over a snapshot because the list grows.
        for subset in list(features):
            features.append([subset[1], subset[0], subset[2], subset[3]])
            features.append([subset[0], subset[2], subset[1], subset[3]])
    else:
        raise AttributeError("Only Radviz on 3 or 4 features supported.")

    features.sort()
    print("build %d models..." % len(features))
    models = []
    for attrs in features:
        models.append(build_map.build_projection_model(attrs, vr.RADVIZ))

    print("build model data...")
    table = build_map.build_model_data(models)
    smx = build_map.build_model_matrix(models)

    mm.save(stem, smx, table, build_map.data())

def radviz_in_vr_mm_4(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, iterative_clustering=False, linkage=None):
    """Run radviz_in_vr_mm() with ``n_attributes`` fixed to 4.

    All other arguments are forwarded unchanged; the return value is whatever
    radviz_in_vr_mm() returns.
    """
    return radviz_in_vr_mm(
        smx,
        table,
        original_data=original_data,
        DATASET=DATASET,
        ROOT=ROOT,
        clusters=clusters,
        seed=seed,
        n_attributes=4,
        iterative_clustering=iterative_clustering,
        linkage=linkage,
    )

def radviz_in_vr_mm(smx, table, original_data=None, DATASET=None, ROOT=None, clusters=None, seed=0, n_attributes=3, iterative_clustering=False, linkage=None):
    """Select RadViz projections by VizRank score and by model-map clustering.

    The instances of ``table`` are first ranked by their ``"P"`` value,
    highest first (the VizRank ordering).  They are then clustered and the
    highest-``"P"`` instance of every cluster is kept (the model-map
    selection).  The clustering method follows from the arguments:

    * ``clusters`` given and ``linkage`` truthy: hierarchical clustering, cut
      into 2, 3, ... top clusters until at least ``clusters`` distinct
      projections are collected;
    * ``clusters`` given and ``iterative_clustering`` truthy: k-means rerun
      for k = 2 .. ``clusters``, result truncated to ``clusters`` entries;
    * ``clusters`` given otherwise: a single k-means run with k = ``clusters``;
    * ``clusters`` is None: label-propagation community detection on a graph
      built from ``smx``.

    When both ``DATASET`` and ``ROOT`` are given, the first six projections of
    each selection are drawn and saved as PDF files in ``ROOT/_projections_``.

    Side effects: sets ``table.random_generator`` to ``seed``, replaces
    ``clustering.kmeans.data_center`` module-wide, and shuffles ``table`` in
    the k-means branches.

    :param smx: model distance matrix; only read for community detection
        (``smx.dim`` is used as the number of graph nodes).
    :param table: table of model instances providing the ``"P"``,
        ``"model"`` and ``"attributes"`` values.
    :param original_data: data set the projections are drawn from; needed
        only when figures are saved.
    :param DATASET: data set name used in figure titles and file names.
    :param ROOT: root directory; figures go to its ``_projections_`` folder.
    :param clusters: requested number of clusters, or None.
    :param seed: value stored in ``table.random_generator`` and passed to
        label propagation.
    :param n_attributes: only used in the figure file names.
    :param iterative_clustering: select the iterative k-means variant.
    :param linkage: linkage for hierarchical clustering; selects that variant.
    :return: tuple ``(vr_models, mm_models)`` of instance lists.
    """

    # VIZRANK

    def save_figure(model_instances, method, clustering_type=""):
        # Draw at most six projections on a 3x2 grid and save them as PDF.
        fig = plt.figure(figsize=(6, 9), dpi=300)
        fig.subplots_adjust(wspace=0.3, hspace=0.2, top=0.9, bottom=0.01, left=0, right=0.90)

        for i, inst in enumerate(model_instances[:6]):
            add_subplot(fig, inst, i=(i + 1))

        plt.figtext(0.5, 0.965,  r"%s: %s" % (method, DATASET), ha='center', color='black', weight='bold', size='large')
        plt.savefig(os.path.join(ROOT, "_projections_", "radviz_%d_%s_%s_%s.pdf" %
            (n_attributes, DATASET, method.lower().replace(" ", ""), "_".join(clustering_type.split(" ")))))
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)

    def add_subplot(fig, model_instance, i=1):
        # Draw one RadViz projection into cell i of the 3x2 grid.
        graph = data.preprocess.scaling.ScaleLinProjData()
        graph.normalizeExamples = 1
        graph.setData(original_data, graph.rawSubsetData)

        # The "attributes" value is a ", "-separated list of attribute names.
        attr_names = model_instance["attributes"].value.split(", ")
        attr_indices = [graph.attribute_name_index[attr] for attr in attr_names]
        class_list = graph.original_data[graph.data_class_index]

        validData = graph.getValidList(attr_indices)
        transProjData = graph.createProjectionAsNumericArray(attr_indices, validData=validData, normalize=graph.normalizeExamples, jitterSize=0.1, useAnchorData=1, removeMissingData=0)
        projData = transProjData.T

        ax = fig.add_subplot(3, 2, i)

        ax.scatter(projData[0], projData[1], c=class_list, s=50., alpha=0.75)

        xanchors = graph.create_xanchors(len(attr_indices))
        yanchors = graph.create_yanchors(len(attr_indices))

        # Unit circle with the attribute anchors and their labels.
        ax.add_artist(Circle((0., 0.), radius=1, edgecolor="black", facecolor="none", alpha=0.75))
        ax.scatter(xanchors, yanchors, c="black", s=50., alpha=0.75)
        for x, y, attr in zip(xanchors, yanchors, attr_names):
            ax.text(x*1.06, y*1.18-0.04, attr, size="x-small")

        ax.set_axis_off()
        ax.set_xlim(-1.2, 1.2)
        ax.set_ylim(-1.2, 1.2)

        ax.set_title(r"$\overline{P}=%.2f$" % (model_instance["P"].value * 100), weight='bold', size='medium', position=(0.5,   1.01),
                        horizontalalignment='center', verticalalignment='center')

    vr_models = sorted((ex for ex in table), key=lambda x: x["P"].value, reverse=True)

    if DATASET is not None and ROOT is not None:
        save_figure(vr_models, "VizRank")

    # MODEL MAP

    class ModelDistanceConstructor(distance.DistanceConstructor):

        def __new__(cls, data=None):
            self = distance.DistanceConstructor.__new__(cls)
            return self.__call__(data) if data else self

        def __call__(self, table):
            return ModelDistance()

    class ModelDistance(distance.Distance):
        def __call__(self, e1, e2):
            return mm.distance_manhattan(e1["model"].value, e2["model"].value)

    def data_center(table):
        # Centroid for k-means: a model whose probabilities are the
        # element-wise mean over all models in `table`.  Predictions and
        # classes are filled with the first class key as placeholders.
        onemodel = table[0]["model"].value
        first_class = list(onemodel.class_values.keys())[0]
        model = mm.Model("RADVIZ", None,
            np.mean(np.array([ex["model"].value.probabilities for ex in table]), axis=0),
            onemodel.class_values, [],
            [first_class] * len(onemodel.instance_predictions),
            [first_class] * len(onemodel.instance_classes))

        return model.get_instance(table.domain)

    def best_in(indices):
        # (score, index) of the highest-"P" instance; the first wins ties.
        return max(((table[i]["P"].value, i) for i in indices), key=itemgetter(0))

    def best_per_label(labelled):
        # `labelled` holds (label, index) pairs sorted by label; return the
        # best (score, index) of every label group, in label order.
        return [best_in([i for _, i in members])
                for _, members in groupby(labelled, key=itemgetter(0))]

    def kmeans_labelled(k):
        # Run k-means with k centroids; return sorted (label, index) pairs.
        kmeans = clustering.kmeans.Clustering(table, centroids=k, distance=ModelDistanceConstructor, initialization=clustering.kmeans.init_diversity)
        return sorted(zip(kmeans.clusters, range(len(kmeans.clusters))), key=itemgetter(0))

    def add_unseen(selected, candidates):
        # Append, best first, the candidates not selected in earlier rounds.
        candidates.sort(key=itemgetter(0), reverse=True)
        selected.extend([cand for cand in candidates if cand not in selected])

    table.random_generator = seed
    clustering.kmeans.data_center = data_center

    if clusters is not None and linkage:
        clustering_type = "hierarchical %s" % str(linkage).lower()
        root = clustering.hierarchical.clustering(table, distance_constructor=ModelDistanceConstructor, linkage=linkage)

        best_projs = []
        for c in range(2, clusters + 1):
            topmost = sorted(clustering.hierarchical.top_clusters(root, c), key=len)
            add_unseen(best_projs, [best_in(cluster) for cluster in topmost])

            if len(best_projs) >= clusters:
                break
        # NOTE(review): unlike the iterative k-means branch, the result is not
        # truncated to `clusters` entries here -- confirm this is intended.

    elif clusters is not None and iterative_clustering:
        clustering_type = "kmeans iterative"
        table.shuffle()
        best_projs = []
        for c in range(2, clusters + 1):
            add_unseen(best_projs, best_per_label(kmeans_labelled(c)))

            if len(best_projs) >= clusters:
                break

        best_projs = best_projs[:clusters]
    else:
        if clusters is None:
            clustering_type = "community detection"
            graph = network.Graph()
            graph.add_nodes_from(range(smx.dim))
            graph.set_items(table)
            graph.nodes_iter()  # result unused; kept from the original code

            # k = sqrt(number of models) is handed to the edge builder --
            # presumably the number of nearest neighbours; TODO confirm.
            k = int(math.sqrt(smx.dim))
            edge_list = network.GraphLayout().edges_from_distance_matrix(smx, -1, -1, k)
            graph.add_edges_from(((u, v, {'weight':1 - d}) for u, v, d in edge_list))

            clusters_ = network.community.label_propagation(graph, iterations=10000, seed=seed)
            print("Clusters: %d" % len(set(clusters_.values())))
            # Fixed: iterate the label mapping `clusters_`; the `clusters`
            # argument is None in this branch and has no items to iterate.
            clusters_ = sorted(((label, node) for node, label in clusters_.items()), key=itemgetter(0))

        else:
            clustering_type = "kmeans"
            table.shuffle()
            clusters_ = kmeans_labelled(clusters)

        best_projs = best_per_label(clusters_)
        best_projs.sort(key=itemgetter(0), reverse=True)

    mm_models = [table[key] for score, key in best_projs]

    if DATASET is not None and ROOT is not None:
        save_figure(mm_models, "Model Map", clustering_type)

    return vr_models, mm_models
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.