# orange-modelmaps / examples / projections / distance_metric_comparisson.py

__author__ = 'Miha Stajdohar'

import cPickle as pickle
import os, os.path, sys
import scipy
import numpy as np
import _modelmaps as mm

from time import time
from Orange.orng import orngVizRank as vr
from Orange import utils

# Root directory of the modelmaps working data; "<ROOT>/tab" is the fallback
# location for data set files.  Only one assignment may be active at a time:
# alternative locations are kept commented out so that no dead assignment is
# silently overwritten further down.
#ROOT = "/home/miha/work/res/modelmaps"
#ROOT = "/Network/Servers/xgridcontroller.private/lab/mihas/modelmaps"
ROOT = "C:\\Users\\Miha\\work\\res\\modelmaps"

def build_map_for_metric_comparisson(DATASET, N):
    """Build projection models for a data set and one model matrix per metric.

    The data file ``<DATASET>.tab`` is looked up first in
    ``utils.environ.dataset_install_dir`` and then in ``<ROOT>/tab``.
    Linear projection, RadViz and PolyViz models are built on the feature
    subsets returned by ``mm.get_feature_subsets``; scatterplot models are
    built on the subsets returned by ``mm.get_feature_subsets_scatterplot``.
    For each of the five distance functions a model matrix is computed and
    the elapsed time, in minutes, is printed.

    Parameters:
        DATASET -- data set name without the ``.tab`` extension.
        N -- forwarded to ``mm.get_feature_subsets`` (presumably the number
             of feature subsets to draw -- confirm against ``_modelmaps``).

    Returns:
        dict mapping the keys "5.1", "5.2", "5.3", "5.4" and "5.5" to the
        value returned by ``build_map.build_model_matrix`` for the class,
        mutual information, Euclidean, Manhattan and rank distance
        respectively.

    Raises:
        IOError -- if the data file is found in neither location.
    """
    # Single-argument print calls behave the same as the print statement on
    # Python 2 and remain valid syntax on Python 3.
    print("DATA SET: %s" % DATASET)

    tab_name = "%s%s" % (DATASET, ".tab")
    fname = os.path.join(utils.environ.dataset_install_dir, tab_name)
    if not os.path.isfile(fname):
        # Fall back to the local data directory.
        fname = os.path.join(ROOT, "tab", tab_name)
        if not os.path.isfile(fname):
            raise IOError("File %s not found." % fname)

    build_map = mm.BuildModelMap(fname)
    domain = build_map.data_d.domain

    nfeatures = len(domain.features)
    features = mm.get_feature_subsets(domain, N, min_features=3, max_features=8)

    # Number of unordered feature pairs, n * (n - 1) / 2; floor division
    # keeps the value an integer regardless of the division semantics in use.
    max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) // 2
    features_scatterplot = mm.get_feature_subsets_scatterplot(domain, max_nfeatures_scatterplot)

    # Same model order as before: all linear projections, then all RadViz,
    # then all PolyViz models, followed by the scatterplots.
    models = [build_map.build_projection_model(f, method)
              for method in (vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ)
              for f in features]
    models.extend(build_map.build_projection_model(attrs, vr.SCATTERPLOT)
                  for attrs in features_scatterplot)

    # Only consumed by the commented-out mm.save call below; the call itself
    # is kept in case build_model_data has side effects (not visible here).
    table = build_map.build_model_data(models)

    # (result key, distance function), in the order they are evaluated.
    metrics = [
        ("5.1", mm.distance_class),
        ("5.3", mm.distance_euclidean),
        ("5.4", mm.distance_manhattan),
        ("5.5", mm.distance_rank),
        ("5.2", mm.distance_mi),
    ]

    smxs = {}
    for key, distance in metrics:
        started = time()
        smxs[key] = build_map.build_model_matrix(models, distance)
        # Elapsed minutes.  The first metric used to be divided by 6 instead
        # of 60, which made its timing incomparable with the others.
        print((time() - started) / 60)
#        mm.save(os.path.join(ROOT, "_projections_", "proj_all_%s_%d_%s" % (DATASET, N, sys.platform)), smxs, table, build_map.data_d)

#    r_file = os.path.join(ROOT, "_projections_", "vals_%s.pkl" % sys.platform)
#    if os.path.exists(r_file) and os.path.isfile(r_file):
#        res = pickle.load(open(r_file, "rb"))
#    else:
#        res = {}
#
#    res.update({"%s %s" % (DATASET, key): smxs[key].get_values() for key in smxs})
#    pickle.dump(res, open(r_file, "wb"), -1)

    return smxs

def matrix_correlation(smxs):
    """Print and return pairwise Spearman rank correlations between matrices.

    For every pair of entries in ``smxs`` the Spearman correlation and its
    p-value are computed between ``smx1[n]`` and ``smx2[n]`` for each ``n``
    in ``range(smx1.dim)`` and then averaged over ``n``.  The lower triangle
    of both tables (coefficients, then p-values) is printed with one line
    per key.

    Parameters:
        smxs -- dict mapping a label to a matrix-like object that exposes a
                ``dim`` attribute and supports indexing with an integer.

    Returns:
        Nested dict ``res[k1][k2] == {"rank": mean_rho, "rank p": mean_p}``;
        the same inner dict is stored for ``[k1][k2]`` and ``[k2][k1]``.
    """
    # A plain ``import scipy`` is not guaranteed to load the ``stats``
    # subpackage, so import it explicitly before it is used.
    import scipy.stats

    keys = sorted(smxs.keys())
    res = {k1: {k2: {} for k2 in keys} for k1 in keys}
    for i, key_i in enumerate(keys):
        smx1 = smxs[key_i]
        # Lower triangle including the diagonal.
        for key_j in keys[:i + 1]:
            smx2 = smxs[key_j]

            # Each spearmanr call yields (coefficient, p-value); averaging
            # along axis 0 gives the mean coefficient and the mean p-value.
            rho, p_value = np.average(
                [scipy.stats.spearmanr(smx1[n], smx2[n]) for n in range(smx1.dim)],
                axis=0)

            r = {"rank": rho, "rank p": p_value}
            res[key_i][key_j] = r
            res[key_j][key_i] = r

    #pickle.dump(res, open(os.path.join(ROOT, "_projections_", "compare_distances.pkl"), "wb"), -1)

    # Each line is "<key>" + four spaces + the joined entries, which is what
    # the former pair of Python 2 print statements (trailing comma) emitted.
    print("rank")
    for i, key_i in enumerate(keys):
        cells = ["%s: %lf" % (key_j, res[key_i][key_j]["rank"]) for key_j in keys[:i]]
        print("%s    %s" % (key_i, "  ".join(cells)))
    print("")
    print("rank p")
    for i, key_i in enumerate(keys):
        cells = ["%s: %e" % (key_j, res[key_i][key_j]["rank p"]) for key_j in keys[:i]]
        print("%s    %s" % (key_i, "  ".join(cells)))

    return res

# Script driver: builds the model matrices for one data set and prints the
# correlations between the distance metrics.  The commented-out pairs below
# are the same two calls for other data sets; uncomment a pair to run it.
# NOTE: this runs at import time, as there is no __main__ guard.

#smxs = build_map_for_metric_comparisson("zoo", N=1000)
#smxs, table, data = mm.load(os.path.join(ROOT, "_projections_", "proj_alldist_4_zoo_1000"))
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("breast-cancer-wisconsin", N=501)
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("wine", N=1000)
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("voting", N=1000)
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("vehicle", N=1000)
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("iris", N=10)
#matrix_correlation(smxs)

#smxs = build_map_for_metric_comparisson("heart_disease", N=1000)
#matrix_correlation(smxs)

# Currently active run.
smxs = build_map_for_metric_comparisson("dermatology", N=1000)
matrix_correlation(smxs)
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.