Commits

Miha Stajdohar committed 893b6fb

Moved around some files.

  • Parent commit fa20699

Files changed (22)

archive/addModel.py

+import pickle
+
+import orange
+import OWDistanceFile
+
+from tools import *
+from matrix2network import matrix2network, cluster2matrix
+
+mergedFile = ROOT + "dst/breast-allmodels-468"
+modelFile = ROOT + "dst/breast-svms-510"
+model_ratio = 0.0
+model_knn = 4
+
+merged_res = pickle.load(open("%s.res" % mergedFile, 'rb'))
+merged_smx, merged_labels, merged_data = OWDistanceFile.readMatrix("%s.dst" % mergedFile)
+merged_smx.items = orange.ExampleTable("%s.tab" % mergedFile)
+
+model_res = pickle.load(open("%s.res" % modelFile, 'rb'))
+model_smx, model_labels, model_data = OWDistanceFile.readMatrix("%s.dst" % modelFile)
+model_smx.items = orange.ExampleTable("%s.tab" % modelFile)
+model_net = matrix2network(model_smx, model_ratio, model_knn)
+model_net.items = merged_smx.items
+
+median_matrix, medians, csizes, bests = cluster2matrix(model_net, model_smx)
+
+for j, m in enumerate(medians):
+    #vizrs[i][m][5]['Method'] = methods[i]
+    #models.append(methods[i])
+
+    ex = orange.Example(merged_smx.items.domain)
+    ex["uuid"] = model_net.items[m]["uuid"].value
+    ex["number of attributes"] = model_net.items[m]["number of attributes"].value
+    ex["CA"] = model_net.items[m]["CA"].value
+    ex["AUC"] = model_net.items[m]["AUC"].value
+    ex["cluster CA"] = bests[j]
+    ex["attributes"] = model_net.items[m]["attributes"].value
+    ex["model"] = model_net.items[m]["model"].value
+    ex["cluster size"] = csizes[j]
+    merged_smx.items.append(ex)
+
+# NOTE: merged_vizr_res, vizrs, projs_points and results are left over from the
+# multi-method loop this script was extracted from and are undefined here:
+#merged_vizr_res.extend([vizrs[i][m] for m in medians])
+#merged_proj_points.extend([projs_points[i][m] for m in medians])
+#results.extend([vizrs[i][m][5].get("Results").results for m in medians])
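+
+# Possible save step (a sketch): the merged matrix is built above but never
+# written back. Assuming the saveModel helper from tools.py and that the
+# merged results dict should travel with the map:
+#merged_smx.results = merged_res
+#saveModel(merged_smx, "%s-plus-svms" % mergedFile)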

archive/bestincluster.py

+import orange
+import OWDistanceFile
+import orngClustering
+
+smx, lbl, data = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-500-tau.dst')
+data = orange.ExampleTable(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-500-tau.tab')
+
+for i in range(smx.dim):
+    for j in range(i):
+        if smx[i,j] < 0:
+            smx[i,j] = 0
+            
+root = orange.HierarchicalClustering(smx, linkage=orange.HierarchicalClustering.Complete)
+
+def printClustering2(cluster):
+    if cluster.branches:
+        return "(%s%s)" % (printClustering2(cluster.left), printClustering2(cluster.right))
+    else:
+        return str(tuple(cluster))
+
+def prune(cluster, togo):
+    if cluster.branches:
+        if togo<0:
+            cluster.branches = None
+        else:
+            for branch in cluster.branches:
+                prune(branch, togo - cluster.height)
+
+#prune(root, 2)
+#printClustering2(root)
+nclusters = 20                
+clustered = orngClustering.hierarchicalClustering_topClustersMembership(root, nclusters)
+
+l = {}
+for ndx, c in enumerate(clustered):
+    l[c] = l[c] + [ndx] if c in l else [ndx]
+
+bestincluster = []
+for i, cluster in l.items():
+    best_val = 0
+    best_ndx = -1
+    
+    if len(cluster) < 5:
+        continue
+
+    for c in cluster:
+        if float(data[c]['vizrank']) > best_val:
+            best_val = float(data[c]['vizrank'])
+            best_ndx = c
+            
+    #print best_ndx, best_val
+    bestincluster.append(best_ndx)
+
+dim = len(bestincluster)    
+newsmx = orange.SymMatrix(dim)
+for i in range(dim):
+    for j in range(i):
+        newsmx[i,j] = smx[bestincluster[i], bestincluster[j]]
+
+newsmx.items = data.getitems(bestincluster)
+
+def saveSymMatrix(matrix, fname):
+    fn = open(fname + ".dst", 'w')
+    fn.write("%d labeled\n" % matrix.dim)
+
+    for i in range(matrix.dim):
+        fn.write("%s" % matrix.items[i]['label'])
+        for j in range(i+1):
+            fn.write("\t%.6f" % matrix[i,j])
+        fn.write("\n")
+
+    fn.close()
+    matrix.items.save(fname + ".tab")
+
+saveSymMatrix(newsmx, r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-bestinclust-20-tau') 
+    

archive/build_astra_map.py

+import Orange
+import orngVizRank as vr
+
+from tools import *
+from build_model_map import save_models, models2matrix, build_projection_model
+
+FOLDS = 10
+MODEL_LIMIT = 3000
+
+#data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/639_500FPRDK.tab")
+
+indices = Orange.core.MakeRandomIndicesCV(data_d, FOLDS, randseed=0, stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)
+##
+#attributes  = getRandomAttributeSubsets(data_d.domain, MODEL_LIMIT)
+#attributes += [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+##
+##attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
+##attributes = set(attributes)
+##attributes = [attr.split(', ') for attr in attributes]
+##
+##
+models = []
+scatterplot_attributes = []
+for i in range(len(data_d.domain.attributes)):
+    for j in range(i):
+        scatterplot_attributes.append([data_d.domain.attributes[i].name, data_d.domain.attributes[j].name])
+
+print "attributes:", len(data_d.domain.attributes)
+print "attribute combinations:", len(scatterplot_attributes)
+random.shuffle(scatterplot_attributes)
+models.extend([build_projection_model(data_d, attrs, indices, vr.SCATTERPLOT) for attrs in scatterplot_attributes[:MODEL_LIMIT]])
+
+#for projection_type in [vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ]:
+#    models.extend([build_projection_model(data_d, attrs, indices, projection_type) for attrs in attributes])
+
+models = [model for model in models if model is not None]
+smx_rank = models2matrix(models)
+
+save_models(models, smx_rank, '%s-%d' % (OUT_FILE, len(smx_rank)))

archive/build_ensemble_map.py

+import numpy
+import Orange
+
+from tools import *
+from build_model_map import save_models, models2matrix
+
+#FOLDS = 10
+MODEL_LIMIT = 5000
+
+#data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/dermatology.tab")
+
+
+def get_attributes(node):
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(get_attributes(node.branches[i]))
+    return atts
+
+def build_rf_models(data):
+    
+    tree = Orange.classification.tree.TreeLearner(storeNodeClassifier = 1, 
+                   storeContingencies=0, storeDistributions=1, minExamples=5, 
+                   storeExamples=1).instance()
+    gini = Orange.feature.scoring.Gini()
+    tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+    tree.maxDepth = 5
+    tree.split = Orange.ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
+    forestLearner = Orange.ensemble.forest.RandomForestLearner(learner=tree, trees=MODEL_LIMIT)
+    forestClassifier = forestLearner(data)
+    
+    models = []
+    for classifier in forestClassifier.classifiers:
+        probabilities, instance_predictions, instance_classes = [], [], []
+        for i in range(len(data)):
+            ex = Orange.data.Instance(data[i])
+            ex.setclass("?")
+            cr = classifier(ex, Orange.core.GetBoth)
+            if cr[0].isSpecial():
+                raise ValueError("Classifier %s returned unknown value" % classifier.name)
+
+            probabilities.append(numpy.array(list(cr[1])))
+            instance_predictions.append(cr[0])
+            instance_classes.append(data[i].get_class())
+
+        models.append({'method' : 'TREE', 
+                       'classifier' : classifier, 
+                       'probabilities' : probabilities, 
+                       'YAnchors' : None, 
+                       'XAnchors' : None, 
+                       'attributes': list(set(get_attributes(classifier.tree))),
+                       'instance_predictions' : instance_predictions,
+                       'instance_classes' : instance_classes})        
+    return models
+
+models = build_rf_models(data_d)
+smx_rank = models2matrix(models)
+save_models(models, smx_rank, '%s-%d' % (OUT_FILE, len(smx_rank)))

archive/classifier2matrix.py

+import time
+import uuid
+import pickle
+import numpy
+
+import orange
+import orngTree
+import orngEnsemble
+import orngTest
+import orngStat
+
+from tools import *
+
+def getForestAttributes(node):
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(getForestAttributes(node.branches[i]))
+    return atts
+
+def getAttributes(classifier):
+    if type(classifier).__name__ == "TreeClassifier":
+        return getForestAttributes(classifier.tree)
+    else:
+        return [var.name for var in classifier.domain.attributes]
+
+def classifier2matrix(data, method, classifiers, fn=None, labels=None):
+    results = [orngTest.testOnData([c], data) for c in classifiers]
+
+    cv = data.domain.classVar.name
+    resultsByClass = [[orngTest.testOnData([c], data.filter({cv : val})) for val in data.domain.classVar.values] for c in classifiers]
+    
+    out = getModelsExampleTable()
+    
+    model_classprobs = []
+    model_predictprobs = []
+    for i, result in enumerate(results):
+        model_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in result.results]))
+        model_predictprobs.append([numpy.array(res.probabilities[0]) for res in result.results])
+        attributes = list(set(getAttributes(classifiers[i])))
+        ex = orange.Example(out.domain)
+        ex['uuid'] = uuid.uuid4().hex
+        ex['model'] = MODEL_LIST[method]
+        ex['attributes'] = ", ".join(sorted(attributes))
+        ex['number of attributes'] = len(attributes)
+        ex['CA'] = orngStat.CA(result)[0]
+        ex['AUC'] = orngStat.AUC(result)[0]
+        ex['CA by class'] = ", ".join([str(orngStat.CA(res)[0]) for res in resultsByClass[i]])
+        ex['label'] = labels[i] if labels else MODEL_LIST[method]
+        out.append(ex)
+        
+    ##########################################################################
+    ## calculate model distance matrices
+    print 'calculating model distance matrices,', len(model_classprobs), 'models'
+    dim = len(model_classprobs)
+    smx_class = orange.SymMatrix(dim)
+    smx_probs = orange.SymMatrix(dim)
+    
+    counter = 0
+    time_start = time.time()
+    
+    for i in range(dim):
+        for j in range(i+1, dim):
+            smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
+            # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
+            smx_probs[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
+                                         
+            counter += 1
+            if counter % 5000 == 0:
+                time_elapsed = time.time() - time_start
+                time_total = time_elapsed / counter * dim * (dim - 1) / 2
+                time_remaining = int(time_total - time_elapsed)
+                print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
+                
+    RV = ([method for ex in out], [ex["uuid"].value for ex in out], [r.results for r in results], \
+          [None for ex in out], classifiers, [ex["attributes"].value.split(', ') for ex in out])
+    
+    if fn:
+        #saveSymMatrix(smx_class, root + out_file + "-" + "tree" + '-' + str(dim) + '-class', out)
+        saveSymMatrix(smx_probs, '%s-%d' % (fn, dim), out)
+        out.save('%s-%d.tab' % (fn, dim))
+        
+        for i, ex in enumerate(out):
+            if str(ex["model"].value) == "SVM":
+                classifiers[i] = None
+        
+        pickle.dump(dict(zip([ex["uuid"].value for ex in out], \
+                             zip([method for ex in out], [r.results for r in results], \
+                                 [None for ex in out], classifiers, \
+                                 [ex["attributes"].value.split(', ') for ex in out]))), \
+                                 open('%s-%d.res' % (fn, dim), 'wb'))
+    
+    smx_probs.items = out
+    return smx_probs, RV
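+
+# Usage sketch: classifier2matrix is the entry point shared by knn2matrix.py,
+# svm2matrix.py and rf2matrix.py. Assuming an Orange 2.x environment with
+# zoo.tab under ROOT/tab:
+#data = getData(ROOT + "tab/zoo.tab")
+#classifiers = [orange.kNNLearner(k=k)(data) for k in (5, 9)]
+#smx, RV = classifier2matrix(data, 7, classifiers)  # 7 = kNNLearner in MODEL_LIST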

archive/clusterModels.py

+import os.path
+import pickle
+
+from tools import *
+from matrix2network import *
+
+
+def clusterModel(fn, knn, ratio=0.0):
+    print "CLUSTERING:", os.path.split(fn)[1]
+    smx = loadModel(fn)
+    net = matrix2network(smx, ratio, knn)
+    net.items = smx.items
+    msmx = cluster2matrix2(net, smx)
+    saveModel(msmx, "%s-clustered" % fn)
+
+#clusterModel(ROOT + "dst/breast-knn-510"    , 2)
+#clusterModel(ROOT + "dst/breast-tree-500"   , 1)
+#clusterModel(ROOT + "dst/breast-bayes-510"  , 2)
+#clusterModel(ROOT + "dst/breast-svms-510"   , 4)
+#clusterModel(ROOT + "dst/breast-polyviz-501", 2)
+#clusterModel(ROOT + "dst/breast-radviz-501" , 2)
+
+#clusterModel(ROOT + "dst/breast-linproj-501", 2)
+clusterModel(ROOT + "dst/zoo-rf-1000"    , 1)

archive/compare_matrices.py

+#import orange
+import OWDistanceFile
+#import scipy.stats
+
+smx1, lbl1, data1 = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-10-spearman.dst')
+smx2, lbl2, data2 = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-10-pearson.dst')
+k = 5
+c = []
+taus = []
+for i in range(smx1.dim):
+    neighbours1 = [(smx1[i,j], j) for j in range(smx1.dim) if i != j]
+    knn1 = set([b for a,b in sorted(neighbours1)[:k]])
+    neighbours2 = [(smx2[i,j], j) for j in range(smx2.dim) if i != j]
+    knn2 = set([b for a,b in sorted(neighbours2)[:k]])
+    #c.append(len(knn1.intersection(knn2)) / float(len(knn1.union(knn2))))
+    c.append(len(knn1.intersection(knn2)) / float(k))
+    
+
+        
+print sum(c) / float(len(c))

archive/forest2matrix.py

+import time
+import uuid
+import pickle
+import numpy
+
+import orange
+import orngTree
+import orngEnsemble
+import orngTest
+import orngStat
+
+from tools import *
+
+root = "C:\\Users\\miha\\Projects\\res\\metamining\\"
+#root = "/home/miha/metamining/"
+out_file = 'dst/primary'
+method = 5
+data = orange.ExampleTable(root + 'tab/primary-c.tab')
+TREE_LIMIT = 15
+
+tree = orngTree.TreeLearner(storeNodeClassifier = 0, storeContingencies=0, \
+  storeDistributions=1, minExamples=5, storeExamples=1).instance()
+gini = orange.MeasureAttribute_gini()
+tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+tree.maxDepth = 4
+tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
+
+forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
+forest = forestLearner(data)
+
+results = [orngTest.testOnData([c], data) for c in forest.classifiers]
+
+out = getModelsExampleTable()
+
+def getAttributes(node):
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(getAttributes(node.branches[i]))
+    return atts
+
+model_classprobs = []
+model_predictprobs = []
+for i, result in enumerate(results):
+    model_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in result.results]))
+    model_predictprobs.append([numpy.array(res.probabilities[0]) for res in result.results])
+    attributes = list(set(getAttributes(forest.classifiers[i].tree)))
+    ex = orange.Example(out.domain)
+    ex['uuid'] = uuid.uuid4().hex
+    ex['model'] = MODEL_LIST[method]
+    ex['attributes'] = ", ".join(attributes)
+    ex['number of attributes'] = len(attributes)
+    ex['score'] = orngStat.CA(result)[0]
+    out.append(ex)
+    
+##########################################################################
+## calculate projection distance matrices
+print 'calculating projection distance matrices,', len(model_classprobs), 'models'
+dim = len(model_classprobs)
+smx_class = orange.SymMatrix(dim)
+smx_probs = orange.SymMatrix(dim)
+
+counter = 0
+time_start = time.time()
+
+for i in range(dim):
+    for j in range(i+1, dim):
+        smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
+        # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
+        smx_probs[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
+                                     
+        counter += 1
+        if counter % 500 == 0:
+            time_elapsed = time.time() - time_start
+            time_total = time_elapsed / counter * dim * (dim - 1) / 2
+            time_remaining = int(time_total - time_elapsed)
+            print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
+    
+saveSymMatrix(smx_class, root + out_file + "-" + "tree" + '-' + str(dim) + '-class', out)
+saveSymMatrix(smx_probs, root + out_file + "-" + "tree" + '-' + str(dim) + '-prob', out)
+out.save(root + out_file + "-" + "tree" + '-' + str(dim) + '.tab')
+output = open(root + out_file + "-" + "tree" + '-' + str(dim) + '.res', 'wb')
+pickle.dump((method, [ex["uuid"].value for ex in out], results, None, forest.classifiers), output)
+output.close()

archive/knn2matrix.py

+from tools import *
+from classifier2matrix import *
+
+print "kNN: calculating classifiers"
+
+data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+method = 7
+
+distanceConstructors = [orange.ExamplesDistanceConstructor_Euclidean(),
+                        orange.ExamplesDistanceConstructor_Manhattan(),
+                        orange.ExamplesDistanceConstructor_Hamming(),
+                        orange.ExamplesDistanceConstructor_Maximal()]
+ks = [5, 9, 15]
+
+print 'reading results'
+modelData = orange.ExampleTable(ROOT + "dst/breast-allmodels-283.tab")
+CAs, attributes = zip(*sorted([(ex['CA'].value, ex['attributes'].value.split(', ')) for ex in modelData])[-50:])
+
+print 'constructing classifiers'
+classifiers = []
+labels = []
+for atts in attributes:
+    for k in ks:
+        exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+        data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+        
+        for distanceConstructor in distanceConstructors:
+            knn = orange.kNNLearner()
+            knn.k = k
+            knn.distanceConstructor = distanceConstructor
+            classifiers.append(knn(data2))
+            labels.append('KNN %s k=%d' % (str(distanceConstructor).split(' ')[0][29:], k))
+
+print 'classifiers to matrix'
+smx, RV = classifier2matrix(data_d, method, classifiers, OUT_FILE + "-knns", labels)
+methods, uuids, res, projections, classifiers, attrs = RV
+

archive/matrix2network.py

+import orange
+import orngNetwork
+import OWDistanceFile
+
+from tools import *
+
+def matrix2network(name, ratio, kNN):
+    if type(name) == type(""):
+        dstFile = name + "-knnpredict.dst"
+        tabFile = name + ".tab"
+        netFile = name + "-knnpredict"
+        smx, labels, data = OWDistanceFile.readMatrix(dstFile)
+        net = orngNetwork.Network(smx.dim, 0)
+        lower, upper = net.getDistanceMatrixThreshold(smx, ratio)
+        net.fromDistanceMatrix(smx, 0, upper, kNN, 0)
+        net.items = orange.ExampleTable(tabFile)
+        net.save(netFile)
+    else:
+        smx = name
+        net = orngNetwork.Network(smx.dim, 0)
+        lower, upper = net.getDistanceMatrixThreshold(smx, ratio)
+        net.fromDistanceMatrix(smx, 0, upper, kNN, 0)
+        #net.items = smx.items.getitems()
+        
+    return net
+
+#net_linproj     = matrix2network("primary-linproj-494"    , 0.01, 1)
+#net_polyviz     = matrix2network("primary-polyviz-494"    , 0.01, 1)
+#net_radviz      = matrix2network("primary-radviz-494"     , 0.01, 1)
+#net_scatterplot = matrix2network("primary-scatterplot-253", 0.00, 1)
+
+#smx, labels, data = OWDistanceFile.readMatrix(dstroot + "primary-scatterplot-253-knnpredict.dst")
+#net = orngNetwork.Network(smx.dim, 0)
+#lower, upper = net.getDistanceMatrixThreshold(smx, 0.05)
+#print upper
+
+#############################################################################
+# best in cluster 2 network
+
+def cluster2matrix(net, name):
+    if type(name) == type(""):
+        dstFile = name + "-knnpredict.dst"
+        smx, labels, data = OWDistanceFile.readMatrix(dstFile)
+    else:
+        smx = name
+    lbls = net.clustering.labelPropagation()
+    clusters = set(lbls)
+    medians = []
+    csizes = []
+    bests = []
+    for c in clusters:
+        cndxs = [i for i, ci in enumerate(lbls) if ci == c]
+        cmatrix = smx.getitems(cndxs)
+        cdsts  = zip([sum([j for j in i]) for i in cmatrix], cndxs, [net.items[i]['CA'].value for i in cndxs])
+        max_score = max([net.items[i]['CA'].value for i in cndxs])
+        cmedian = min(cdsts)[1]
+        medians.append((cmedian, max_score, len(cndxs)))
+
+    medians.sort()
+    medians, bests, csizes = map(list, zip(*medians))
+    medianmatrix = smx.getitems(medians)
+    medianmatrix.items = net.items.getitems(medians)
+    if type(name) == type(""):
+        # dstroot was undefined here; assuming median maps are saved under ROOT
+        saveSymMatrix(medianmatrix, ROOT + "medians-" + name + "-" + str(medianmatrix.dim) + "-knnpredict", None, True)
+    return medianmatrix, medians, csizes, bests
+
+def cluster2matrix2(net, smx):
+    lbls = net.clustering.labelPropagation()
+    clusters = set(lbls)
+    medians = []
+    csizes = []
+    bests = []
+    for c in clusters:
+        cndxs = [i for i, ci in enumerate(lbls) if ci == c]
+        cmatrix = smx.getitems(cndxs)
+        cdsts  = zip([sum([j for j in i]) for i in cmatrix], cndxs, [net.items[i]['CA'].value for i in cndxs])
+        max_score = max([net.items[i]['CA'].value for i in cndxs])
+        cmedian = min(cdsts)[1]
+        medians.append((cmedian, max_score, len(cndxs)))
+
+    medians.sort()
+    medians, bests, csizes = map(list, zip(*medians))
+    medianmatrix = smx.getitems(medians)
+    medianmatrix.items = net.items.getitems(medians)
+    medianmatrix.results = {}
+    
+    for i in range(len(medianmatrix.items)):
+        medianmatrix.items[i]["cluster size"] = csizes[i]
+        medianmatrix.items[i]["cluster CA"] = bests[i]
+        uuid = medianmatrix.items[i]["uuid"].value
+        medianmatrix.results[uuid] = smx.results[uuid]
+    
+    return medianmatrix
+
+#cluster2matrix(net_linproj,     "primary-linproj-494")
+#cluster2matrix(net_polyviz,     "primary-polyviz-494")
+#cluster2matrix(net_radviz,      "primary-radviz-494")
+#cluster2matrix(net_scatterplot, "primary-scatterplot-253")
+    
+    

archive/mergeModels.py

+import os.path
+import orange
+
+from tools import *
+
+
+inputs = [ROOT + "dst/breast-scatterplot-36"       ,
+         ROOT + "dst/breast-knn-510-clustered"    ,
+         ROOT + "dst/breast-tree-500-clustered"   ,
+         ROOT + "dst/breast-bayes-510-clustered"  , 
+         ROOT + "dst/breast-svms-510-clustered"   , 
+         ROOT + "dst/breast-polyviz-501-clustered", 
+         ROOT + "dst/breast-radviz-501-clustered" , 
+         ROOT + "dst/breast-linproj-501-clustered"]
+
+models = []
+
+for fn in inputs:
+    print "READING:", os.path.split(fn)[1]
+    models.append(loadModel(fn))
+    
+    
+mdata = orange.ExampleTable(models[0].items.domain)
+results = {}
+
+for model in models:
+    mdata.extend(model.items)
+    results.update(model.results)
+    
+# projections have different results than classification models
+vizrResults = [results[ex["uuid"].value][1] if \
+               type(results[ex["uuid"].value][1]) == type([]) else \
+               results[ex["uuid"].value][1][5].get("Results").results for ex in mdata]
+
+smx_class, smx_prob = models2matrix(vizrResults)
+
+smx_prob.items = mdata
+smx_prob.results = results
+
+saveModel(smx_prob, "%sdst/breast-merged-%d" % (ROOT, smx_prob.dim))
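+
+# Round-trip check (a sketch), assuming loadModel from tools.py finds the
+# "<fn>.dst"/"<fn>.tab"/"<fn>.res" triple that saveModel just wrote:
+#m = loadModel("%sdst/breast-merged-%d" % (ROOT, smx_prob.dim))
+#print m.dim, len(m.results)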

archive/model_map_similarity.py

+import numpy
+import Orange
+
+from operator import itemgetter
+from tools import *
+
+print 'loading...'
+
+fileName = 'zoo-1603'
+fileCommon = ROOT + '_explore_/' + fileName
+fileA = fileCommon + '-rank'
+#fileB = ROOT + 'new/zoo-allmodels-420'
+fileB = fileCommon + '-class'
+
+# warning: the matrix is saved lower-triangular!
+modelA = numpy.load('%s.npy' % fileA)
+# rebuild the full symmetric matrix from the lower triangle
+modelA = modelA + modelA.transpose()
+itemsA = Orange.data.Table('%s.tab' % fileCommon)
+
+modelB = numpy.load('%s.npy' % fileB)
+modelB = modelB + modelB.transpose()
+#smxB = loadModel(fileB)
+#modelB = numpy.zeros((smxB.dim, smxB.dim))
+#for i in range(smxB.dim):
+#    for j in range(smxB.dim):
+#        modelB[i,j] = smxB[i,j]
+        
+itemsB = Orange.data.Table('%s.tab' %  fileCommon)
+
+def compare_model_similarity(modelA, itemsA, modelB, itemsB):
+    print len(modelA), 'read in model A,', len(modelB), 'read in model B'
+    
+    print 'matching...'
+    matchA = sorted((ex['model'].value + ', '.join(sorted(ex['attributes'].value.split(', '))), i) for i, ex in enumerate(itemsA))
+    matchB = sorted((ex['model'].value + ', '.join(sorted(ex['attributes'].value.split(', '))), i) for i, ex in enumerate(itemsB))
+    
+    i,j = 0,0
+    matches = []
+    # warning! this works only if list values are unique (which they are in my case :)
+    while i < len(matchA) and j < len(matchB):
+        mA, iA = matchA[i]
+        mB, iB = matchB[j]
+    
+        if mA == mB:
+            matches.append((iA,iB))
+            i += 1
+            j += 1 
+        elif mA < mB:
+            i += 1
+        else:
+            j += 1
+    
+    print len(matches), 'matched'
+    indA, indB = zip(*matches)
+    
+    matrixA = modelA.take(indA, axis=0).take(indA, axis=1)
+    matrixB = modelB.take(indB, axis=0).take(indB, axis=1)
+    
+    nnA = numpy.argsort(matrixA)
+    nnB = numpy.argsort(matrixB)
+
+    print 'comparing...'
+    scores = []
+    for k in range(2, len(nnA)+1):
+        nnA_tmp = nnA[:,:k]
+        nnB_tmp = nnB[:,:k]
+        count = 0
+        for i in range(len(nnA)):
+            count += len(set(nnA_tmp[i]).intersection(set(nnB_tmp[i]))) - 1
+
+        scores.append(count / float((k-1)*len(nnA)))
+        if k % 100 == 0:
+            print k
+    return scores
+
+def plot_scores(x, y, fn):
+    import matplotlib.pyplot as plt
+
+    plt.title('')
+    plt.xlabel('k-neighbors')
+    plt.ylabel('similarity')
+    plt.grid(True)
+    
+    plt.plot(x, y, linewidth=1.0)
+    
+    plt.savefig(fn)
+    
+scores = compare_model_similarity(modelA, itemsA, modelB, itemsB)
+
+plot_scores(range(1, len(scores[:50])+1), scores[:50], '%s%s-similarity-50.png' % (ROOT, fileName))
+plot_scores(range(1, len(scores)+1), scores, '%s%s-similarity.png' % (ROOT, fileName))
+
+print 'saving results...'
+fp = open(ROOT + 'similarity_results.txt', 'a')
+fp.write('%s-class;%s-prob;%s\n' % (fileName, fileName, ';'.join(str(s) for s in scores)))
+fp.close()

archive/projection_dss.py

+import orange
+import orngClustering
+import OWDistanceFile
+
+root = "c:\\Users\\miha\\Projects\\res\\metamining\\"
+in_file = root + "dst\\zoo-projections-500-abs.dst"
+
+smx, labels, data = OWDistanceFile.readMatrix(in_file)
+data = orange.ExampleTable(root + 'tab\\zoo-projections-500.tab')
+
+# normalize to interval [0,1]
+smx.normalize(0)
+# invert 1 - X
+smx.invert(1)
+
+c = orange.HierarchicalClustering(smx, linkage=orange.HierarchicalClustering.Average)
+depth = 10
+min_projections = 20
+
+clusters = []
+def findProjections(cluster, l):
+    level = l + 1
+    if cluster.branches and level <= depth and len(cluster) > min_projections:
+        findProjections(cluster.left, level)
+        findProjections(cluster.right, level)
+    else:
+        clusters.append(cluster)
+
+findProjections(c, 0)
+
+include = []
+for cluster in clusters:
+    scores = [(data[c]['vizrank'].value, data[c]['number of attributes'].value, c) for c in cluster]
+    scores.sort()
+    include.append(scores[-1][2])
+
+new_smx = orange.SymMatrix(len(include))
+for i in range(new_smx.dim):
+    for j in range(i):
+        new_smx[i,j] = smx[include[i], include[j]]
+new_smx.items = data.getitems(include)
+
+def saveSymMatrix(matrix, fname):
+    fn = open(fname + ".dst", 'w')
+    fn.write("%d labeled\n" % matrix.dim)
+
+    for i in range(matrix.dim):
+        fn.write("%s" % matrix.items[i]['label'])
+        for j in range(i+1):
+            fn.write("\t%.6f" % matrix[i,j])
+        fn.write("\n")
+
+    fn.close()
+    matrix.items.save(fname + ".tab")
+
+saveSymMatrix(new_smx, root + 'projections-dss')

archive/projections2matrix.py

+import time
+import math
+import uuid
+import pickle
+
+import numpy
+import scipy.stats
+
+import orange
+import orngClustering
+import orngVizRank as vr
+import orngStat
+import orngTest
+
+from tools import *
+          
+def calculateProjections(data, method, projectionLimit=10, attributes=None):
+    """initialize VizRank and evaluate projections"""
+
+    print "%s: calculating projections" % MODEL_LIST[method]
+    vizr = vr.VizRank(method)
+    if method == vr.LINEAR_PROJECTION:
+        vizr.projOptimizationMethod = 1
+    else:
+        vizr.projOptimizationMethod = 0
+    vizr.setData(data)
+    vizr.projectionLimit = projectionLimit
+    vizr.attributeCount =  9
+    vizr.storeEachPermutation = 1
+    vizr.optimizationType = vr.MAXIMUM_NUMBER_OF_ATTRS
+    vizr.saveEvaluationResults = 1
+    vizr.attrCont = vr.CONT_MEAS_NONE
+    #vizr.attrCont = vr.CONT_MEAS_S2NMIX
+    vizr.attrDisc = vr.DISC_MEAS_NONE
+    vizr.attrSubsetSelection = vr.DETERMINISTIC_ALL
+    
+    if attributes:
+        evaluateProjections(vizr, attributes)
+    else:
+        vizr.evaluateProjections()
+        
+    return vizr
+
+def processProjections(data, vizr_results, projection_points=[], method=None, scaleProjData=None):
+    """calculate projection distance matrices"""
+    
+    if projection_points == [] and scaleProjData == None:
+        print "Error: either projection_points or scaleProjData must be given."
+        return
+    
+    if projection_points != [] and scaleProjData != None:
+        print "Warning: projection_points and scaleProjData both given. scaleProjData will be ignored."
+        scaleProjData = None
+    
+    out = getModelsExampleTable()
+    
+    #preprocess projections
+    attributeset = set()
+    todelete = []
+    print "constructing example table"
+    #projection_distance_matrices = []
+    #projection_distances = []
+    projection_classprobs = []
+    projection_predictprobs = []
+    
+    counter = 0
+    time_start = time.time()
+    for ndx, r in enumerate(vizr_results):
+        lbl = ', '.join(sorted(r[3]))
+        uuid_result = uuid.uuid4().hex
+        nAttributes = len(r[3])
+        settingsDict = r[5]
+        settingsDict['uuid'] = uuid_result
+        if "Method" in settingsDict: method = settingsDict.get("Method") 
+        
+        if not str(method) + lbl in attributeset:
+            attributeset.add(str(method) + lbl)
+        else:
+            print lbl
+            todelete.append(ndx)
+            continue
+        
+        ex = orange.Example(out.domain)
+        #for i in range(len(data.domain.attributes)): ex[i] = "0"
+        #for a in r[3]: ex[a] = "1"
+        ex["CA"] = float(r[0]) / 100
+        ex["attributes"] = lbl
+        ex["number of attributes"] = nAttributes
+        ex['uuid'] = uuid_result
+        ex['model'] = MODEL_LIST[method]
+        ex["label"] = MODEL_LIST[method]
+        results = settingsDict.get("Results")
+        resultsByClass = []
+        for c in range(len(results.classValues)):
+            resTmp = orngTest.ExperimentResults(results.numberOfIterations, 
+                                                results.classifierNames, 
+                                                results.classValues, 
+                                                results.weights, 
+                                                classifiers=results.classifiers, 
+                                                loaded=results.loaded)
+            resTmp.results.extend([res for res in results.results if res.actualClass == c])
+            resultsByClass.append(resTmp)
+            
+        ex['CA by class'] = ", ".join([str(orngStat.CA(res)[0]) for res in resultsByClass])
+        ex["AUC"] = orngStat.AUC(results)[0]
+        out.append(ex)
+        
+        # calculate projection points (x,y) from Data set
+        attributeNameIndex = dict([(data.domain[i].name, i) for i in range(len(data.domain))])
+        attrIndices = [attributeNameIndex[val] for val in r[3]]
+        if scaleProjData == None:
+            x1, y1 = zip(*projection_points[ndx])
+        else:
+            positions = scaleProjData.createProjectionAsExampleTable(attrIndices)
+            x1 = [ex[positions.domain[0]].value for ex in positions]
+            y1 = [ex[positions.domain[1]].value for ex in positions]
+            projection_points.append(zip(x1,y1))
+            
+        x1 = numpy.array(x1)
+        y1 = numpy.array(y1)
+
+        # learner evaluation results
+        projection_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in results.results]))
+        projection_predictprobs.append([numpy.array(res.probabilities[0]) for res in results.results])
+        # calculate distances among points in a projection (distance matrix) 
+        d1 = numpy.sqrt((x1[:,None] - x1[:])**2 + (y1[:,None] - y1[:])**2)
+        #projection_distance_matrices.append(d1)
+        # flatten distance matrix to a vector of projection distances
+        v1 = d1[numpy.triu(numpy.ones((len(x1), len(x1))), 1).astype(bool)]
+        #projection_distances.append(v1)
+        
+        counter += 1
+        if counter % 1000 == 0:
+            time_elapsed = time.time() - time_start
+            time_total = time_elapsed / counter * len(vizr_results)
+            time_remaining = int(time_total - time_elapsed)
+            print counter, '/', len(vizr_results), '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
+    
+    # delete duplicated projections
+    todelete.sort(reverse=1)
+    for i in todelete:
+        del vizr_results[i]
+    
+    ##########################################################################
+    ## calculate projection distance matrices
+    print 'calculating projection distance matrices,', len(projection_points), 'projections'
+    dim = len(projection_points)
+    pdim = len(projection_points[0])
+    smx_spearman = orange.SymMatrix(dim)
+    smx_pearson = orange.SymMatrix(dim)
+    smx_tau = orange.SymMatrix(dim)
+    smx_knn_class = orange.SymMatrix(dim)
+    smx_knn_predict = orange.SymMatrix(dim)
+    
+    counter = 0
+    time_start = time.time()
+    for i in range(dim):
+        for j in range(i+1, dim):
+            #v1 = projection_distances[i]
+            #v2 = projection_distances[j]
+            #d1 = projection_distance_matrices[i]
+            #d2 = projection_distance_matrices[j]
+            # spearman correlation
+            #val, prob = scipy.stats.spearmanr(v1, v2)
+            #smx_spearman[i,j] = 1 - max(val, 0)
+            # pearson correlation
+            #val, prob = scipy.stats.pearsonr(v1, v2)
+            #smx_pearson[i,j] = 1 - max(val, 0)
+    #        # kendall tau; !!! too time expensive !!!
+    #        val, prob = scipy.stats.kendalltau(v1, v2)
+    #        smx_tau[i,j] = 1 - max(val, 0)
+    #        taus = []
+    #        for k in range(len(d1)):
+    #            neighbours1 = [(d1[k,l], l) for l in range(len(d1)) if k != l]
+    #            knn1 = [b for a,b in sorted(neighbours1)]
+    #            neighbours2 = [(d2[k,l], l) for l in range(len(d2)) if k != l]
+    #            knn2 = [b for a,b in sorted(neighbours2)]
+    #            taus.append(abs(scipy.stats.kendalltau(knn1, knn2)[0]))
+    #        smx_tau[i,j] = 1 - (sum(taus) / len(taus))
+            # sum((pi_1 - pi_2)^2) - class probability squared error
+            smx_knn_class[i,j] = numpy.sum(numpy.power(projection_classprobs[i] - projection_classprobs[j], 2))
+            # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
+            smx_knn_predict[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(projection_predictprobs[i],projection_predictprobs[j])])
+                                         
+            counter += 1
+            if counter % 5000 == 0:
+                time_elapsed = time.time() - time_start
+                time_total = time_elapsed / counter * dim * (dim - 1) / 2
+                time_remaining = int(time_total - time_elapsed)
+                print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
+    
+    return smx_spearman, smx_pearson, smx_knn_class, smx_knn_predict, out, projection_points
+    
+def saveMatrices(smx_spearman, smx_pearson, smx_knn_class, smx_knn_predict, out, method_name):
+    dim = smx_spearman.dim
+    #saveSymMatrix(smx_spearman, root_out + 'dst\\zoo-projections-' + str(dim) + '-spearman')
+    #saveSymMatrix(smx_pearson, root_out + 'dst\\zoo-projections-' + str(dim) + '-pearson')
+    #saveSymMatrix(smx_tau, root_out + 'dst\\zoo-projections-' + str(dim) + '-tau')
+    saveSymMatrix(smx_spearman,    root_out + out_file + "-" + method_name + '-' + str(dim) + '-spearman', out)
+    saveSymMatrix(smx_pearson,     root_out + out_file + "-" + method_name + '-' + str(dim) + '-pearson', out)
+    #saveSymMatrix(smx_tau,         root_out + out_file + '-' + str(dim) + '-kendalltau', out)
+    saveSymMatrix(smx_knn_class,   root_out + out_file + "-" + method_name + '-' + str(dim) + '-knnclass', out)
+    saveSymMatrix(smx_knn_predict, root_out + out_file + "-" + method_name + '-' + str(dim) + '-knnpredict', out)
+    out.save(root_out + out_file + "-" + method_name + '-' + str(dim) + '.tab')
+
+def metamining(data, method, method_name, projectionLimit, fn=None, attributes=None):
+    if method == vr.SCATTERPLOT:
+        vizr = calculateProjections(data, method, projectionLimit)
+    else:
+        vizr = calculateProjections(data, method, projectionLimit, attributes)
+        
+    smx_spearman, smx_pearson, smx_knn_class, smx_knn_predict, out, projection_points = processProjections(data, vizr.results, [], method, vizr.graph)
+    
+    if fn:
+        fn = fn + "-" + method_name + '-' + str(smx_knn_predict.dim)
+        saveSymMatrix(smx_knn_predict, fn + '', out) 
+        out.save(fn + '.tab')
+        pickle.dump(dict(zip([ex["uuid"].value for ex in out], \
+                             zip([method for ex in out], vizr.results, \
+                                 projection_points, [None for ex in out], \
+                                 [ex["attributes"].value.split() for ex in out]))), open(fn + '.res', 'wb'))
+        
+    smx_knn_predict.items = out
+    return vizr, smx_knn_predict, projection_points
+
+# example calls (data and projectionLimit supplied by the caller):
+#metamining(data, vr.SCATTERPLOT, "scatterplot", projectionLimit)
+#metamining(data, vr.LINEAR_PROJECTION, "linproj", projectionLimit)
+#metamining(data, vr.RADVIZ, "radviz", projectionLimit)
+#metamining(data, vr.POLYVIZ, "polyviz", projectionLimit)

archive/rf2matrix.py

+import pickle
+
+import orange
+import orngTree
+import orngEnsemble
+
+from tools import *
+from classifier2matrix import *
+from matrix2network import *
+
+TREE_LIMIT = 1000
+KNN = 2
+data_d = getData(ROOT + "tab/zoo.tab")
+
+gini = orange.MeasureAttribute_gini()
+
+tree = orngTree.TreeLearner(storeNodeClassifier = 1, \
+                            storeContingencies=0, \
+                            storeDistributions=1, \
+                            minExamples=5, \
+                            storeExamples=1).instance()
+tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+tree.maxDepth = 5
+tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
+
+forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
+forest = forestLearner(data_d)
+
+smx, RV = classifier2matrix(data_d, 5, forest.classifiers, OUT_FILE + "-rf")
+methods, uuids, res, projections, classifiers, attrs = RV
+#net = matrix2network(smx, 0.0, KNN)
+#net.items = smx.items
+
+smx.results = dict(zip(uuids, zip(methods, res, projections, classifiers, attrs)))
+saveModel(smx, "%sdst/zoo-rf-%d" % (ROOT, smx.dim))

archive/svm2matrix.py

+import orngSVM
+
+from tools import *
+from classifier2matrix import *
+
+print "SVM: calculating classifiers"
+
+data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+method = 8
+
+cs = [0.01, 0.1, 1, 10, 100]
+cs = [1.]
+
+print 'reading results'
+modelData = orange.ExampleTable(ROOT + "dst/breast-bayes-510.tab")
+#attributes = list(set([(ex['CA'].value, ex['label'].value) for ex in modelData]))
+#CAs, attributes = zip(*sorted([(ca, attrs.split(', ')) for ca, attrs in attributes]))
+attributes = [', '.join(sorted(ex['attributes'].value.split(', '))) for ex in modelData]
+print len(attributes)
+attributes.sort()
+attributes = list(set(attributes))
+attributes = [attrs.split(', ') for attrs  in attributes]
+print len(attributes)
+print 'constructing classifiers'
+classifiers = []
+labels = []
+for atts in attributes:
+    for c in cs:
+        exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+        data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+        svm = orngSVM.SVMLearner()
+        #svm.kernelFunc = orngSVM.RBFKernelWrapper(orange.ExamplesDistanceConstructor_Euclidean(data), gamma=0.5)
+        #svm.kernel_type = orange.SVMLearner.Custom
+        svm.C = c
+        svm.gamma = 1. / len(data2)
+        svm.probability = 0
+        classifiers.append(svm(data2))
+        labels.append('SVM c=' + str(c))
+
+print 'classifiers to matrix'
+smx, RV = classifier2matrix(data_d, method, classifiers, OUT_FILE + "-svms", labels)
+methods, uuids, res, projections, classifiers, attrs = RV
+

archive/testkmeans.py

+import orange
+import orngClustering
+
+root = r"C:\Python26\Lib\site-packages\orange\doc\datasets"
+data = orange.ExampleTable(root + "\\brown-selected.tab")
+
+att = ['cdc15 270', 'spo- mid', 'heat 20']
+pp = orange.Preprocessor_select()
+pp.attributes = [data.domain[i] for i in att]
+data_clust = pp(data)
+
+for i in range(1,100):
+    km = orngClustering.KMeans(data_clust, centroids = 3, maxiters = i, initialization = orngClustering.kmeans_init_random, distance = orange.ExamplesDistanceConstructor_Hamming)
+    print km.score
+import pickle
+
+import orange
+import orngTree
+import orngEnsemble
+
+from tools import *
+
+pickle.load(open(ROOT + "dst/breast-knn-510.res", 'rb'))
+
+#
+#m = loadModel(ROOT + "dst/breast-tree-500-clustered") 
+#r0 = m.results[m.items[0]["uuid"].value]
+#c0 = r0[3]
+#
+
+#pickle.dump(c0, file(ROOT + "tree.pck", "wb"))
+
+#r0 = results[results.keys()[0]]
+#method, vizr_result, projection_points, classifier, attrs = r0
+
+#
+#dataL = orange.ExampleTable(ROOT + "dst/breast-linproj-501.tab")
+#dataR = orange.ExampleTable(ROOT + "dst/breast-radviz-501.tab")
+#dataP = orange.ExampleTable(ROOT + "dst/breast-polyviz-501.tab")
+#dataT = orange.ExampleTable(ROOT + "dst/breast-tree-500.tab")
+#dataB = orange.ExampleTable(ROOT + "dst/breast-bayes-510.tab")
+#dataK = orange.ExampleTable(ROOT + "dst/breast-knn-510.tab")
+#
+#def getatt(data):
+#    att = [','.join(sorted(ex['attributes'].value.split(', '))) for ex in data]
+#    return att, set(att)
+#
+#attL, attsL = getatt(dataL)
+#attR, attsR = getatt(dataR)
+#attP, attsP = getatt(dataP)
+#attT, attsT = getatt(dataT)
+#attB, attsB = getatt(dataB)
+#attK, attsK = getatt(dataK)
+#
+#print len(attL), len(set(attL))
+#print len(attR), len(set(attR))
+#print len(attP), len(set(attP))
+#print len(attT), len(set(attT))
+#print len(attB), len(set(attB))
+#print len(attK), len(set(attK))

examples/projections.py

+import orngVizRank as vr
+
+import mm
+
+ROOT = "/home/miha/work/res/modelmap/"
+build_map = mm.BuildModelMap(ROOT + "tab/zoo.tab")
+
+nfeatures = len(build_map.data_d.domain.features)
+features = mm.get_feature_subsets(build_map.data_d.domain, 120)
+
+max_nfeatures_scatterplot = (nfeatures ** 2 - nfeatures) / 2
+features_scatterplot = mm.get_feature_subsets_scatterplot(build_map.data_d.domain, max_nfeatures_scatterplot)
+
+models = []
+models.extend([build_map.build_projection_model(f, vr.LINEAR_PROJECTION) for f in features])
+models.extend([build_map.build_projection_model(attrs, vr.SCATTERPLOT) for attrs in features_scatterplot])
+
+smx = build_map.build_model_matrix(models)
+table = build_map.build_model_data(models)
+
+import math
+import os.path
+import pickle
+import random
+import time
+
+import numpy as np
+
+from orngScaleData import getVariableValuesSorted
+from OWDistanceFile import readMatrix
+
+from Orange import data, feature
+
+from model import *
+from modelmap import *
+
+ROOT = "/home/miha/work/res/metamining/"
+#OUT_FILE = ROOT + "dst/zoo"
+OUT_FILE = ROOT + "_astra_/fprdk"
+
+def saveSymMatrix(matrix, file, items=None, saveItems=False):
+    fn = open(file + ".dst", 'w')
+    fn.write("%d labeled\n" % matrix.dim)
+    items = items if items else matrix.items
+    for i in range(matrix.dim):
+        fn.write("%s" % items[i]['attributes'])
+        for j in range(i + 1):
+            fn.write("\t%.6f" % matrix[i, j])
+        fn.write("\n")
+
+    fn.close()
+    if saveItems:
+        items.save(file + ".tab")
+
+
+
+def loadModel(fn):
+    if os.path.exists('%s.npy' % fn):
+        matrix, _labels, _data = readMatrix('%s.npy' % fn)
+    elif os.path.exists("%s-prob.dst" % fn):
+        matrix, _labels, _data = readMatrix("%s-prob.dst" % fn)
+    elif os.path.exists("%s.dst" % fn):
+        matrix, _labels, _data = readMatrix("%s.dst" % fn)
+    else:
+        return None
+
+    if os.path.exists("%s.tab" % fn):
+        # keep the Orange.data module visible: don't rebind the name 'data'
+        matrix.items = data.Table("%s.tab" % fn)
+    else:
+        print "ExampleTable %s not found!\n" % ("%s.tab" % fn)
+    if os.path.exists("%s.res" % fn):
+        matrix.results = pickle.load(open("%s.res" % fn, 'rb'))
+    else:
+        print "Results pickle %s not found!\n" % ("%s.res" % fn)
+
+    return matrix
+
+def saveModel(smx, fn):
+    saveSymMatrix(smx, "%s" % fn, smx.items)
+    smx.items.save('%s.tab' % fn)
+    pickle.dump(smx.results, open('%s.res' % fn, "wb"))
+
+
+
+def evaluateProjections(vizr, attributeList):
+    vizr.evaluatedProjectionsCount = 0
+    vizr.optimizedProjectionsCount = 0
+    vizr.evaluationData = {}            # clear all previous data about tested permutations and stuff
+    vizr.evaluationData["triedCombinations"] = {}
+    vizr.clearResults()
+
+    vizr.clearArguments()
+
+    if vizr.projOptimizationMethod != 0:
+        vizr.freeviz.useGeneralizedEigenvectors = 1
+        vizr.graph.normalizeExamples = 0
+
+    domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(vizr.graph.dataDomain.classVar.name, values=getVariableValuesSorted(vizr.graph.dataDomain.classVar))])
+    classListFull = vizr.graph.originalData[vizr.graph.dataClassIndex]
+
+    for attributes in attributeList:
+        attrIndices = [vizr.graph.attributeNameIndex[attr] for attr in attributes]
+        #print attrIndices
+        if vizr.projOptimizationMethod != 0:
+            table = None
+            projections = vizr.freeviz.findProjection(vizr.projOptimizationMethod, attrIndices, setAnchors=0, percentDataUsed=vizr.percentDataUsed)
+            if projections != None:
+                xanchors, yanchors, (attrNames, newIndices) = projections
+                table = vizr.graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=xanchors, YAnchors=yanchors)
+
+            if table == None or len(table) < vizr.minNumOfExamples: continue
+            accuracy, other_results = vizr.evaluateProjection(table)
+            generalDict = {"XAnchors": list(xanchors), "YAnchors": list(yanchors), "Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {"XAnchors": list(xanchors), "YAnchors": list(yanchors)}
+            vizr.addResult(accuracy, other_results, len(table), attrNames, vizr.evaluatedProjectionsCount, generalDict=generalDict)
+            vizr.evaluatedProjectionsCount += 1
+        else:
+            XAnchors = vizr.graph.createXAnchors(len(attrIndices))
+            YAnchors = vizr.graph.createYAnchors(len(attrIndices))
+            validData = vizr.graph.getValidList(attrIndices)
+            if np.sum(validData) >= vizr.minNumOfExamples:
+                classList = np.compress(validData, classListFull)
+                selectedData = np.compress(validData, np.take(vizr.graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
+                sum_i = vizr.graph._getSum_i(selectedData)
+
+                table = vizr.graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
+                accuracy, other_results = vizr.evaluateProjection(table)
+                generalDict = {"Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {}
+                vizr.addResult(accuracy, other_results, len(table), [vizr.graph.attributeNames[i] for i in attrIndices], vizr.evaluatedProjectionsCount, generalDict)
+                vizr.evaluatedProjectionsCount += 1
+
+    return vizr.evaluatedProjectionsCount
+import uuid
+
+from itertools import groupby
+from operator import itemgetter
+
+from Orange import data
+
+class Model(object):
+
+    def __init__(self, type_, classifier, probabilities, attributes, \
+                 instance_predictions=None, instance_classes=None, \
+                 name=None, XAnchors=None, YAnchors=None):
+        """Meta-model, a node in Model Map."""
+
+        self.uuid = uuid.uuid4().hex
+        self.type = type_
+        self.classifier = classifier
+        self.probabilities = probabilities
+        self.attributes = attributes
+        self.instance_predictions = instance_predictions
+        self.instance_classes = instance_classes
+        self.name = name if name is not None else type_
+        self.XAnchors = XAnchors
+        self.YAnchors = YAnchors
+
+    def get_instance(self, domain):
+        """Return an :obj:`Orange.data.Table` instance with model meta-data.
+        
+        :param domain: instance will match given domain 
+        :type domain: :obj:`Orange.data.Domain`
+        """
+
+        inst = data.Instance(domain)
+
+        inst['uuid'] = self.uuid
+        inst['number of attributes'] = len(self.attributes)
+        results = [p == c for p, c in zip(self.instance_predictions, self.instance_classes)]
+        inst['CA'] = sum(results) / float(len(results))
+        inst['type'] = self.type
+        inst['model'] = self
+        inst['attributes'] = ', '.join(self.attributes)
+        #ex["AUC"] = nets[i].items[m]["AUC"].value
+        resultsByClass = sorted([(p == c, c) for p, c in zip(self.instance_predictions, self.instance_classes)], key=itemgetter(1))
+        groups = []
+        for _k, g in groupby(resultsByClass, lambda x: x[1].value):
+            resultsByClass, _classes = zip(*g)
+            groups.append(resultsByClass)
+        inst["CA by class"] = ', '.join([str(sum(results) / float(len(results))) for results in groups])
+        #ex["cluster CA"] = best_indices[i][j]
+        #ex["cluster size"] = median_csizes[i][j]
+        inst["label"] = self.name
+
+        return inst
+import itertools, math, random, os.path, time, uuid
+import cPickle as pickle
+
+import scipy.stats
+import numpy as np
+
+import orngVizRank as vr
+
+from operator import itemgetter
+from orngScaleData import getVariableValuesSorted
+from model import Model
+
+from Orange import data, distance, feature, ensemble
+from Orange.classification.knn import kNNLearner
+from Orange.classification.tree import TreeLearner
+
+MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]
+
+def distance_class(m1, m2):
+    w = np.average(m1.instance_predictions != m2.instance_predictions)
+    return 1 if math.isnan(w) else w
+
+def distance_prob(m1, m2):
+    ninstances = len(m1.probabilities)
+    normalization_factor = 2 * ninstances
+
+    return sum([np.sum(np.power(p1 - p2, 2)) for \
+                        (p1, p2) in zip(m1.probabilities, \
+                           m2.probabilities)]) / normalization_factor
+
+def distance_rank(m1, m2):
+    ninstances = len(m1.probabilities)
+
+    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=0)[0])
+    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=1)[0])
+    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=None)[0])
+    w = 1 - abs(sum([scipy.stats.spearmanr(p1, p2)[0] for \
+                        (p1, p2) in zip(m1.probabilities,
+                           m2.probabilities)]) / ninstances)
+    return w
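+
+# The three distances above measure, in turn: disagreement of crisp class
+# predictions, squared error between probability vectors, and Spearman rank
+# disagreement. A sketch (an editorial addition, not the original API) of
+# turning one of them into a full distance matrix over a list of models:
+def _distance_matrix(models, dist=distance_prob):
+    import orange
+    smx = orange.SymMatrix(len(models))
+    for i in range(len(models)):
+        for j in range(i):
+            smx[i, j] = dist(models[i], models[j])
+    return smx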
+
+def get_feature_subsets_scatterplot(domain, nsubsets):
+    """Return attribute subsets for Scatter Plot."""
+    attrs = []
+    for i in range(len(domain.features)):
+        for j in range(i):
+            attrs.append((domain.features[i].name, domain.features[j].name))
+    random.shuffle(attrs)
+
+    if nsubsets > len(attrs):
+        raise ValueError("nsubsets exceeds the number of possible feature pairs: %d." % len(attrs))
+
+    return attrs[:nsubsets]
+
+def get_feature_subsets(domain, nsubsets):
+    """Return random attribute subsets.
+    
+    :param domain: data set domain to extract features
+    :type domain: :obj:`Orange.data.Domain`
+    
+    :param nsubsets:  number of attribute subsets
+    :type nsubsets: int
+    """
+
+    def binomial(n, k):
+        if n > k:
+            return math.factorial(n) / (math.factorial(k) * math.factorial(n - k))
+        elif n == k:
+            return 1
+        else:
+            return 0
+
+    attrs = [var.name for var in domain.features]
+    nattrs = len(attrs)
+    total = sum(binomial(nattrs, i) for i in range(2, nattrs))
+
+    if nsubsets > total:
+        raise ValueError("nsubsets exceeds the number of possible feature subsets: %d." % total)
+
+    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(2, nattrs))))
+    selectors = [1] * nsubsets + [0] * (total - nsubsets)
+    random.shuffle(selectors)
+    return list(itertools.compress(combinations, selectors))
+
+def get_models_table():
+    """Return an empty data table for model meta data."""
+
+    attrs = []
+    attrs.append(feature.String("uuid"))
+    varAttrs = feature.Continuous("number of attributes")
+    varAttrs.numberOfDecimals = 0
+    attrs.append(varAttrs)
+    attrs.append(feature.Continuous("CA"))
+    attrs.append(feature.Continuous("AUC"))
+    attrs.append(feature.String("CA by class"))
+    attrs.append(feature.Continuous("cluster CA"))
+    attrs.append(feature.String("label"))
+    attrs.append(feature.String("attributes"))
+    attrs.append(feature.Discrete("type", values=MODEL_LIST[1:]))
+    attrs.append(feature.Python("model"))
+    csizes = feature.Continuous("cluster size")
+    csizes.numberOfDecimals = 0
+    attrs.append(csizes)
+
+    return data.Table(data.Domain(attrs, 0))
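+
+# get_models_table pairs with Model.get_instance above; a sketch of filling
+# the table, assuming models is a list of built Model instances:
+#table = get_models_table()
+#for model in models:
+#    table.append(model.get_instance(table.domain))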
+
+class BuildModelMap(object):
+
+    def __init__(self, fname, folds=10, model_limit=500):
+        self.folds = folds
+        self.model_limit = model_limit
+        self.data_d = self.get_data(fname)
+        self.data_c = self.get_data(fname, continuize=True)
+        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
+
+    def get_data(self, fname, continuize=False):
+        """Return a data Table.
+           
+        :param fname: data set file name
+        :type fname: string
+        
+        :param continuize: if True, load the name-c.tab variant of the data set, since Orange's DomainContinuizer changes attribute names.
+        :type continuize: bool
+        
+        """
+
+        if continuize:
+            # load the pre-continuized version of the data set (<name>-c.tab)
+            base, ext = os.path.splitext(fname)
+            fname = "%s-c%s" % (base, ext)
+            return data.Table(fname)
+            ##############################################################################
+            ## The continuization below is disabled: DomainContinuizer changes
+            ## attribute names, so a pre-continuized table is loaded instead.
+#            transformer = data.continuization.DomainContinuizer()
+#            transformer.multinomialTreatment = data.continuization.DomainContinuizer.NValues
+#            transformer.continuousTreatment = data.continuization.DomainContinuizer.NormalizeBySpan
+#            transformer.classTreatment = data.continuization.DomainContinuizer.Ignore
+#            table = table.translate(transformer(table))
+#            return feature.imputation.AverageConstructor(table)(table)
+        else:
+            return data.Table(fname)
+
+
+    def build_model(self, learner, dataset):
+        """Build a classification meta-model.
+
+        :param learner: classification learner to wrap
+        :type learner: :obj:`Orange.classification.Learner`
+
+        :param dataset: data set
+        :type dataset: :obj:`Orange.data.Table`
+
+        """
+
+        probabilities = []
+        instance_predictions = []
+        instance_classes = []
+        # estimate class probabilities using CV
+        for fold in range(self.folds):
+            learnset = dataset.selectref(self.indices, fold, negate=1)
+            testset = dataset.selectref(self.indices, fold, negate=0)
+            classifier = learner(learnset)
+            tcn = 0
+            for i in range(len(dataset)):
+                if self.indices[i] == fold:
+                    # copy the test instance and hide its class before prediction
+                    ex = data.Instance(testset[tcn])
+                    ex.setclass("?")
+
+                    cr = classifier(ex, classifier.GetBoth)
+                    if cr[0].isSpecial():
+                        raise ValueError("Classifier %s returned unknown value" % classifier.name)
+
+                    probabilities.append(np.array(list(cr[1])))
+                    instance_predictions.append(cr[0])
+                    instance_classes.append(testset[tcn].get_class())
+                    tcn += 1
+
+        return Model(type(learner).__name__,
+                     learner(dataset),
+                     probabilities,
+                     [x.name for x in dataset.domain.attributes],
+                     instance_predictions,
+                     instance_classes)
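+
+    # Usage sketch (learner choice is illustrative):
+    #   mm = BuildModelMap("zoo.tab")
+    #   nb_model = mm.build_model(Orange.classification.bayes.NaiveLearner(), mm.data_d)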
+
+    def build_projection_model(self, attributes, visualizationMethod=vr.LINEAR_PROJECTION):
+        """Build a projection meta-model."""
+
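+        # pick the data-scaling class matching the requested visualization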
+        method = "?"
+        if visualizationMethod == vr.SCATTERPLOT:
+            import orngScaleScatterPlotData
+            graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
+            method = "SCATTERPLOT"
+        elif visualizationMethod == vr.RADVIZ:
+            import orngScaleLinProjData
+            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph.normalizeExamples = 1
+            method = "RADVIZ"
+        elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
+            import orngScaleLinProjData
+            from orngLinProj import FreeViz
+            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph.normalizeExamples = 0
+            method = "SPCA"
+        elif visualizationMethod == vr.POLYVIZ:
+            import orngScalePolyvizData
+            graph = orngScalePolyvizData.orngScalePolyvizData()
+            graph.normalizeExamples = 1
+            method = "POLYVIZ"
+        else:
+            print "an invalid visualization method was specified. VizRank can not run."
+            return
+
+        graph.setData(self.data_c, graph.rawSubsetData)
+        attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
+        domain = data.Domain([feature.Continuous("xVar"),
+                              feature.Continuous("yVar"),
+                              feature.Discrete(graph.dataDomain.class_var.name,
+                                               values=getVariableValuesSorted(graph.dataDomain.class_var))])
+        classListFull = graph.originalData[graph.dataClassIndex]
+        table = None
+
+        if visualizationMethod == vr.LINEAR_PROJECTION:
+            freeviz = FreeViz(graph)
+            projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
+            if projections is not None:
+                XAnchors, YAnchors, (attrNames, newIndices) = projections
+                table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
+            else:
+                print 'no projection found'
+        elif visualizationMethod == vr.SCATTERPLOT:
+            XAnchors = YAnchors = None
+            table = graph.createProjectionAsExampleTable(attrIndices)
+        else:
+            XAnchors = graph.createXAnchors(len(attrIndices))
+            YAnchors = graph.createYAnchors(len(attrIndices))
+            validData = graph.getValidList(attrIndices)
+            # more than min number of examples
+            if np.sum(validData) >= 10:
+                classList = np.compress(validData, classListFull)
+                selectedData = np.compress(validData, np.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
+                sum_i = graph._getSum_i(selectedData)
+                table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
+
+        if not table: return None
+
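+        # score the projection: cross-validated 10-NN on the 2-D projected
+        # data, in the spirit of VizRank's projection scoring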
+        probabilities = []
+        instance_predictions = []
+        instance_classes = []
+        learner = kNNLearner(k=10, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
+        for fold in range(self.folds):
+            learnset = table.selectref(self.indices, fold, negate=1)
+            testset = table.selectref(self.indices, fold, negate=0)
+            classifier = learner(learnset)
+            tcn = 0
+            for i in range(len(table)):
+                if (self.indices[i] == fold):
+                    ex = data.Instance(testset[tcn])
+                    ex.setclass("?")
+
+                    cr = classifier(ex, classifier.GetBoth)
+                    if cr[0].isSpecial():
+                        raise ValueError("Classifier %s returned unknown value" % classifier.name)
+                    probabilities.append(np.array(list(cr[1])))
+                    instance_predictions.append(cr[0])
+                    instance_classes.append(testset[tcn].get_class())
+                    tcn += 1
+
+        return Model(method,
+                     learner(table),
+                     probabilities,
+                     attributes,
+                     instance_predictions,
+                     instance_classes,
+                     XAnchors=XAnchors,
+                     YAnchors=YAnchors)
+
+    def build_rf_models(self, dataset):
+        """Build a random forest and collect per-tree class probabilities with CV."""
+
+        # flat list of class-probability vectors, one per tree and test instance
+        probabilities = []
+
+        # estimate class probabilities using CV
+        for fold in range(self.folds):
+            learnset = dataset.selectref(self.indices, fold, negate=1)
+            testset = dataset.selectref(self.indices, fold, negate=0)
+
+            tree = TreeLearner(storeNodeClassifier=1,
+                       storeContingencies=0, storeDistributions=1, minExamples=5,
+                       storeExamples=1).instance()
+            gini = feature.scoring.Gini()
+            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+            tree.maxDepth = 4
+            tree.split = ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
+            forestLearner = ensemble.forest.RandomForestLearner(learner=tree, trees=self.model_limit)
+            forestClassifier = forestLearner(learnset)
+
+            for classifier in forestClassifier.classifiers:
+                tcn = 0
+                for i in range(len(dataset)):
+                    if self.indices[i] == fold:
+                        ex = data.Instance(testset[tcn])
+                        ex.setclass("?")
+                        tcn += 1
+                        cr = classifier(ex, classifier.GetBoth)
+                        if cr[0].isSpecial():
+                            raise ValueError("Classifier %s returned unknown value" % classifier.name)
+                        probabilities.append(np.array(list(cr[1])))
+
+        # assumption: the final meta-classifier is a forest trained on the full data
+        model_classifier = forestLearner(dataset)
+        model_classifier.probabilities = probabilities
+        return model_classifier
+
+
+    def _print_time(self, time_start, iteration, numiter):
+        """Every 10000 iterations, print progress and the estimated time remaining."""
+        if iteration % 10000 == 0:
+            time_elapsed = time.time() - time_start
+            time_total = time_elapsed / iteration * numiter * (numiter - 1) / 2
+            time_remaining = int(time_total - time_elapsed)
+            print iteration, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
+
+    def build_model_matrix(self, models, dist=distance_class):
+        """Build a distance matrix of models given the distance measure."""
+
+        dim = len(models)
+        print "%d models to matrix -- rank" % dim
+        smx = np.zeros(shape=(dim, dim))
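+        # the distance measure is symmetric, so only the lower triangle is computed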
+
+        counter = 0
+        time_start = time.time()
+        for i in range(dim):
+            for j in range(i):
+                smx[i, j] = dist(models[i], models[j])
+                counter += 1
+                self._print_time(time_start, counter, dim)
+
+        return smx
+
+    def build_model_data(self, models):
+        """Return an :obj:`Orange.data.Table` of model meta-data."""
+
+        table = get_models_table()
+        table.extend([model.get_instance(table.domain) for model in models])
+        return table
+
+    def save_models(self, models, smx, fn):
+        """Save models to disk."""
+
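+        # NOTE: the saving code below is commented out; it sketches the intended
+        # outputs (numpy matrix, meta-data .tab, pickled models .res)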
+        print 'saving matrix'
+#        if type(smx) == type([]):
+#            for s, title in smx:
+#                np.save('%s-%s' % (fn, title), s)
+#        else:
+#            np.save('%s' % (fn), smx)
+#
+#        print 'build out data'
+#
+#        print 'saving out data'
+#        out.save('%s.tab' % (fn))
+#        print 'saving models'
+#        pickle.dump(dict(zip(uuids, models)), open('%s.res' % (fn), "wb"))
+
+