Commits

Miha Stajdohar committed fa20699

Model Map repo.

  • Participants

Comments (0)

Files changed (25)

+# NOTE(review): merge-script fragment — loads a merged-model distance matrix and
+# an SVM-model distance matrix, clusters the model network, and appends the
+# cluster medians to the merged example table.
+import pickle
+
+from tools import *
+
+mergedFile = ROOT + "dst/breast-allmodels-468"
+modelFile = ROOT + "dst/breast-svms-510"
+model_ratio = 0.0
+model_knn = 4
+
+# Load cached results plus the matching distance matrix and example table.
+merged_res = pickle.load(open("%s.res" % mergedFile, 'rb'))
+merged_smx, merged_labels, merged_data = OWDistanceFile.readMatrix("%s.dst" % mergedFile)
+merged_smx.items = orange.ExampleTable("%s.tab" % mergedFile)
+
+model_res = pickle.load(open("%s.res" % modelFile, 'rb'))
+model_smx, model_labels, model_data = OWDistanceFile.readMatrix("%s.dst" % modelFile)
+model_smx.items = orange.ExampleTable("%s.tab" % modelFile)
+model_net = matrix2network(model_smx, model_ratio, model_knn)
+model_net.items = merged_smx.items
+
+median_matrix, medians, csizes, bests = cluster2matrix(model_net, model_smx)
+
+# NOTE(review): the loop below references i, nets, vizrs, projs_points,
+# best_indices and median_csizes, none of which are defined in this file — it
+# looks like the body of a "for i, medians in enumerate(...)" loop copied from
+# metamining.py; as committed this would raise NameError. TODO confirm intent.
+for j, m in enumerate(medians):
+    #vizrs[i][m][5]['Method'] = methods[i]
+    #models.append(methods[i])
+    
+    # One output row per cluster median model.
+    ex = orange.Example(merged_smx.items.domain)
+    ex["uuid"] = nets[i].items[m]["uuid"].value
+    ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
+    ex["CA"] = nets[i].items[m]["CA"].value
+    ex["AUC"] = nets[i].items[m]["AUC"].value
+    ex["cluster CA"] = best_indices[i][j]
+    ex["attributes"] = nets[i].items[m]["attributes"].value
+    ex["model"] = nets[i].items[m]["model"].value
+    ex["cluster size"] = median_csizes[i][j]
+    merged_smx.items.append(ex)
+    
+merged_vizr_res.extend([vizrs[i][m] for m in medians])
+merged_proj_points.extend([projs_points[i][m] for m in medians])
+results.extend([vizrs[i][m][5].get("Results").results for m in medians])

archive/metamining.py

+import os
+import os.path
+import gc
+import pickle
+import random
+
+import orngNetwork
+import OWDistanceFile
+
+from tools import *
+from projections2matrix import *
+from matrix2network import *
+from classifier2matrix import *
+
+# Search/size limits: projection search and random-forest tree count.
+LIMIT = 1000
+PROJECTION_LIMIT = LIMIT
+TREE_LIMIT = LIMIT
+TREE_FILE = "breast-tree"
+#TREE_FILE = "zoo-tree"
+
+#ROOT_DATA = "C:\\Python26\\Lib\\site-packages\\orange\\doc\\datasets\\"
+#ROOT = "C:\\Users\\miha\\Projects\\res\\metamining\\"
+#ROOT_DATA = "/home/miha/metamining/"
+#ROOT = "/home/miha/metamining/"
+#OUT_FILE = ROOT + "dst/breast"
+#OUT_FILE = ROOT + "dst/zoo"
+
+#data = getData(ROOT_DATA + "primary-tumor.tab", 1)
+#data_c = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+#data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+# NOTE(review): ROOT, OUT_FILE, getData and vr are not defined here —
+# presumably provided by "from tools import *"; TODO confirm.
+data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/zoo.tab")
+#
+# NOTE(review): this assignment is dead — "input" is re-assigned in the
+# "ZOO: READ LIMIT 100" section below before it is ever read. The name also
+# shadows the builtin input().
+input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
+         (vr.RADVIZ           , "radviz"     , 0.00, 1),
+         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 
+
+#input_classifier = [("knn"  , 0.00, 1)]
+#                    ("bayes", 0.00, 1),
+#                    ("tree" , 0.00, 1)]
+##############################################################################
+# BREAST: READ LIMIT 100
+##############################################################################
+#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
+#         (ROOT + "dst/breast-linproj-101"   , 0.00, 2),
+#         (ROOT + "dst/breast-radviz-101"    , 0.00, 2),
+#         (ROOT + "dst/breast-polyviz-101"   , 0.00, 2)] 
+#
+#input_classifier = [(ROOT + "dst/breast-tree-110" , 0.00, 1),
+#                    (ROOT + "dst/breast-knn-110"  , 0.00, 2),
+#                    (ROOT + "dst/breast-bayes-110", 0.00, 2)]
+#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
+##############################################################################
+# BREAST: READ LIMIT 500
+##############################################################################
+#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
+#         (ROOT + "dst/breast-linproj-501"   , 0.00, 2),
+#         (ROOT + "dst/breast-radviz-501"    , 0.00, 2),
+#         (ROOT + "dst/breast-polyviz-501"   , 0.00, 2)] 
+#
+#input_classifier = [(ROOT + "dst/breast-tree-500" , 0.00, 1),
+#                    (ROOT + "dst/breast-knn-510"  , 0.00, 2),
+#                    (ROOT + "dst/breast-bayes-510", 0.00, 2),
+#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
+##############################################################################
+# ZOO: READ LIMIT 100
+##############################################################################
+# Active configuration: (method, name, ratio, knn) per projection type.
+input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
+         (vr.RADVIZ           , "radviz"     , 0.00, 1),
+         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 
+
+# (learner name, ratio, knn) triples consumed by getClassifiers below.
+input_classifier = [("tree" , 0.00, 1),
+                    ("bayes", 0.00, 1),
+                    ("knn"  , 0.00, 1)]
+
+#input = [("dst/zoo-scatterplot-120", 0.00, 1),
+#         ("dst/zoo-linproj-93"     , 0.00, 1),
+#         ("dst/zoo-radviz-100"     , 0.00, 1),
+#         ("dst/zoo-polyviz-100"    , 0.00, 1)] 
+#
+#input_classifier = [("dst/zoo-tree-100" , 0.00, 1),
+#                    ("dst/zoo-bayes-192", 0.00, 1),
+#                    ("dst/zoo-knn-192"  , 0.00, 1)]
+##############################################################################
+# ZOO: READ LIMIT 500
+##############################################################################
+#input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+#         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 2),
+#         (vr.RADVIZ           , "radviz"     , 0.00, 4),
+#         (vr.POLYVIZ          , "polyviz"    , 0.00, 4)] 
+#input_classifier = [("tree" , 0.00, 2),
+#                    ("bayes", 0.00, 4),
+#                    ("knn"  , 0.00, 2)]
+#input = [("dst/zoo-scatterplot-120", 0.00, 1),
+#         ("dst/zoo-linproj-453"    , 0.00, 2),
+#         ("dst/zoo-radviz-500"     , 0.00, 4),
+#         ("dst/zoo-polyviz-500"    , 0.00, 4)] 
+#input_classifier = [("dst/zoo-tree-500" , 0.00, 2),
+#                    ("dst/zoo-bayes-573", 0.00, 4),
+#                    ("dst/zoo-knn-573"  , 0.00, 2)]
+##############################################################################
+# PRIMARY: READ LIMIT 500
+##############################################################################
+#input = [(ROOT + "dst/primary-scatterplot-153", 0.00, 1),
+#         (ROOT + "dst/primary-linproj-377"    , 0.00, 2),
+#         (ROOT + "dst/primary-radviz-500"     , 0.00, 4),
+#         (ROOT + "dst/primary-polyviz-500"    , 0.00, 4)]
+#input_classifier = [("dst/primary-tree-500" , 0.00, 2),
+#                    ("bayes", 0.00, 2),
+#                    ("knn"  , 0.00, 2)]
+##############################################################################
+
+def getProjections(input, attributes=None):
+    """Compute or load projection models for every entry of `input`.
+
+    Each entry is either (file_prefix, ratio, knn) — previously saved
+    results are loaded from file_prefix.res/.dst/.tab — or
+    (method, name, ratio, knn), in which case projections are searched
+    with metamining() on data_c.
+    Returns (vizrs, nets, smxs, projs_points, methods, uuid_results),
+    where uuid_results maps uuid -> (method, result, points, classifier, attrs).
+    """
+    vizrs = []
+    smxs = []
+    nets = []    
+    projs_points = []
+    methods = []
+    uuid_results = {}
+    for inp in input:
+        # A string first element means "load cached results from disk".
+        if type(inp[0]) == type(""):
+            fn, ratio, knn = inp
+            print "READING: %s" % fn            
+            pkl_file = open(fn + ".res", "rb")
+            res = pickle.load(pkl_file)
+            pkl_file.close()
+            matrix, labels, data = OWDistanceFile.readMatrix(fn + ".dst")
+            matrix.items = orange.ExampleTable(fn + ".tab")
+            net = matrix2network(matrix, ratio, knn)
+            net.items = matrix.items
+            # res[uuid] is (method, result, points, ...) — pick fields by index.
+            vizrs.append([res[ex['uuid'].value][1] for ex in net.items])
+            smxs.append(matrix)
+            nets.append(net)
+            projs_points.append([res[ex['uuid'].value][2] for ex in net.items])
+            methods.append([res[ex['uuid'].value][0] for ex in net.items])
+            uuid_results.update(res)
+                
+        else:
+            # Fresh projection search for this visualization method.
+            method, name, ratio, knn = inp
+            vizr, smx_knn_predict, proj_points = metamining(data_c, method, name, PROJECTION_LIMIT, OUT_FILE, attributes)
+            net = matrix2network(smx_knn_predict, ratio, knn)
+            net.items = smx_knn_predict.items
+            vizrs.append(vizr.results)
+            smxs.append(smx_knn_predict)
+            nets.append(net)
+            projs_points.append(proj_points)
+            methods.append(method)
+            attrs = [ex['attributes'].value.split(', ') for ex in net.items]
+            # Build uuid -> (method, result, points, None, attrs); the None slot
+            # holds the classifier for classifier-based models elsewhere.
+            uuid_results.update(dict(zip([ex["uuid"].value for ex in net.items], \
+                                zip([method for ex in net.items], [r for r in vizr.results], \
+                                    proj_points, [None for ex in net.items], attrs))))
+          
+    return vizrs, nets, smxs, projs_points, methods, uuid_results
+
+def getClassifiers(input, attributes=[]):
+    results = {}
+    smxs = []
+    nets = []
+    for s, ratio, knn in input:
+        classifiers = None
+        if s.upper() == "TREE":
+            print "TREE: calculating classifiers"
+            method = 5
+            tree = orngTree.TreeLearner(storeNodeClassifier = 1, storeContingencies=0, \
+              storeDistributions=1, minExamples=5, storeExamples=1).instance()
+            gini = orange.MeasureAttribute_gini()
+            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+            tree.maxDepth = 4
+            tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
+            forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
+            forest = forestLearner(data_d)
+            classifiers = forest.classifiers
+            outFile = OUT_FILE + "-tree"
+            
+        if s.upper() == "BAYES":
+            if len(attributes) == 0: continue
+            print "BAYES: calculating classifiers"
+            method = 6
+            
+            classifiers = []
+            for atts in attributes:
+                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+                classifiers.append(orange.BayesLearner(data2))
+            outFile = OUT_FILE + "-bayes"
+            
+        if s.upper() == "KNN":
+            if len(attributes) == 0: continue
+            print "kNN: calculating classifiers"
+            method = 7
+            
+            classifiers = []
+            for atts in attributes:
+                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+                classifiers.append(orange.kNNLearner(data2, k=math.sqrt(len(data2))|1))
+            outFile = OUT_FILE + "-knn"
+            
+        if classifiers:
+            smx, RV = classifier2matrix(data_d, method, classifiers, outFile)
+            methods, uuids, res, projections, classifiers, attrs = RV
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(dict(zip(uuids, zip(methods, res, projections, classifiers, attrs))))
+            smxs.append(smx)
+            nets.append(net)
+
+        if os.path.exists("%s.res" % s):
+            print "READING: %s" % s
+            res = pickle.load(open("%s.res" % s, 'rb'))
+            smx, labels, data = OWDistanceFile.readMatrix("%s.dst" % s)
+            smx.items = orange.ExampleTable("%s.tab" % s)
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(res)
+            smxs.append(smx)
+            nets.append(net)
+            
+    return nets, smxs, results
+    
+def getClusters(nets, smxs):
+    median_matrices = []
+    median_indices = []
+    median_csizes = []
+    best_indices = []
+    for net, smx in zip(nets, smxs):    
+        median_matrix, medians, csizes, bests = cluster2matrix(net, smx)
+        median_matrices.append(median_matrix)
+        median_indices.append(medians)
+        median_csizes.append(csizes)
+        best_indices.append(bests)
+    
+    return median_matrices, median_indices, median_csizes, best_indices
+
+out = getModelsExampleTable()
+
+##############################################################################
+# GET PROJECTIONS
+##############################################################################
+##############################################################################
+# GENERATE ATTRIBUTES FROM PROJECTIONS
+#attrs = [u.split(';') for u in list(set([';'.join(sorted(v[4])) for v in res.values()]))]
+
+attributes = getRandomAttributeSubsets(data_d.domain, LIMIT)
+attributes = attributes + [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+
+attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
+attributes = set(attributes)
+attributes = [attr.split(', ') for attr in attributes]
+
+vizrs, nets, smxs, projs_points, methods, res = getProjections(input, attributes)
+
+for smx in smxs:
+    print smx.dim
+    
+merged_vizr_res = []
+merged_proj_points = []
+probabilities = []
+results = []
+
+for i, vizr in enumerate(vizrs):
+    merged_vizr_res.extend(vizrs[i])
+    merged_proj_points.extend(projs_points[i])
+    results.extend([v[5].get("Results").results for v in vizrs[i]])
+    ex = orange.Example(out.domain)
+    for m in range(len(vizr)):
+        ex["uuid"] = nets[i].items[m]["uuid"].value
+        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
+        ex["CA"] = nets[i].items[m]["CA"].value
+        #ex["AUC"] = nets[i].items[m]["AUC"].value
+        #ex["CA by class"] = nets[i].items[m]["CA by class"].value
+        #ex["cluster CA"] = best_indices[i][j]
+        ex["attributes"] = nets[i].items[m]["attributes"].value
+        ex["model"] = nets[i].items[m]["model"].value
+        #ex["cluster size"] = median_csizes[i][j]
+        out.append(ex)
+
+# Keep the projection results (uuid -> model info) for the final save below.
+uuid_results = res
+#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
+#print "projection results:", len(res)
+#merged_vizr_res = []
+#merged_proj_points = []
+#probabilities = []
+#results = []
+#models = []
+#for i, medians in enumerate(median_indices):    
+#    for j, m in enumerate(medians):
+#        vizrs[i][m][5]['Method'] = methods[i]
+#        models.append(methods[i])
+#        ex = orange.Example(out.domain)
+#        ex["uuid"] = nets[i].items[m]["uuid"].value
+#        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
+#        ex["CA"] = nets[i].items[m]["CA"].value
+#        ex["AUC"] = nets[i].items[m]["AUC"].value
+#        ex["CA by class"] = nets[i].items[m]["CA by class"].value
+#        ex["cluster CA"] = best_indices[i][j]
+#        ex["attributes"] = nets[i].items[m]["attributes"].value
+#        ex["model"] = nets[i].items[m]["model"].value
+#        ex["cluster size"] = median_csizes[i][j]
+#        out.append(ex)
+#        
+#    merged_vizr_res.extend([vizrs[i][m] for m in medians])
+#    merged_proj_points.extend([projs_points[i][m] for m in medians])
+#    results.extend([vizrs[i][m][5].get("Results").results for m in medians])
+
+##############################################################################
+# GET CLASSIFIERS
+##############################################################################
+#attributes = attributes + [[var.name] for var in data_d.domain if var != data_d.domain.classVar]
+#
+#nets, smxs, uuid_results = getClassifiers(input_classifier, attributes)
+#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
+#
+#print "process classifiers"
+#uuid_results.update(res)
+#print "merged results:", len(uuid_results)
+#for i in range(len(nets)):
+#    items = nets[i].items
+#    median_matrix = median_matrices[i]
+#    medians = median_indices[i]
+#    for j, m in enumerate(medians):
+#        out_ex = orange.Example(out.domain)
+#        out_ex["uuid"] = items[m]["uuid"].value
+#        out_ex["number of attributes"] = items[m]["number of attributes"].value
+#        out_ex["CA"] = items[m]["CA"].value
+#        out_ex["AUC"] = items[m]["AUC"].value
+#        out_ex["CA by class"] = items[m]["CA by class"].value
+#        out_ex["cluster CA"] = best_indices[i][j]
+#        out_ex["attributes"] = items[m]["attributes"].value
+#        out_ex["model"] = items[m]["model"].value
+#        out_ex["cluster size"] = median_csizes[i][j]
+#        out.append(out_ex)
+#        
+#    results.extend([uuid_results[items[m]["uuid"].value][1] for m in medians])
+#    models.extend([items[m]["model"].value for m in medians])
+#
+smx_class, smx_prob = models2matrix(results)
+
+# Save the merged probability matrix, the model example table and the per-uuid
+# results; file names carry the matrix dimension.
+print "saving", '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim)
+saveSymMatrix(smx_prob, '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim), out)
+out.save('%s-allmodels-%d.tab' % (OUT_FILE, smx_prob.dim))
+nr = {}
+# Restrict the saved results to uuids present in "out"; print any uuid that
+# has no matching result entry.
+for ex in out:
+    uuid = ex['uuid'].value 
+    if uuid in uuid_results:
+        nr[uuid] = uuid_results[uuid]
+    else:
+        print uuid
+pickle.dump(nr, open('%s-allmodels-%d.res' % (OUT_FILE, smx_prob.dim), "wb"))
+pickle.dump(uuid_results, open('%s-allmodels-%d-all.res' % (OUT_FILE, smx_prob.dim), "wb"))

archive/metamining.py-bad

+import os
+import os.path
+import gc
+import pickle
+import random
+
+import orngNetwork
+import OWDistanceFile
+
+from tools import *
+from projections2matrix import *
+from matrix2network import *
+from classifier2matrix import *
+
+# NOTE(review): this file is a near-duplicate of archive/metamining.py
+# (archived as "-bad"); it differs only in the attributes source table below.
+# Search/size limits: projection search and random-forest tree count.
+LIMIT = 1000
+PROJECTION_LIMIT = LIMIT
+TREE_LIMIT = LIMIT
+TREE_FILE = "breast-tree"
+#TREE_FILE = "zoo-tree"
+
+#ROOT_DATA = "C:\\Python26\\Lib\\site-packages\\orange\\doc\\datasets\\"
+#ROOT = "C:\\Users\\miha\\Projects\\res\\metamining\\"
+#ROOT_DATA = "/home/miha/metamining/"
+#ROOT = "/home/miha/metamining/"
+#OUT_FILE = ROOT + "dst/breast"
+#OUT_FILE = ROOT + "dst/zoo"
+
+#data = getData(ROOT_DATA + "primary-tumor.tab", 1)
+#data_c = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+#data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+# NOTE(review): ROOT, OUT_FILE, getData and vr are presumably provided by
+# "from tools import *"; TODO confirm.
+data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/zoo.tab")
+#
+# NOTE(review): dead assignment — "input" is re-assigned in the
+# "ZOO: READ LIMIT 100" section below before it is ever read.
+input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
+         (vr.RADVIZ           , "radviz"     , 0.00, 1),
+         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 
+
+#input_classifier = [("knn"  , 0.00, 1)]
+#                    ("bayes", 0.00, 1),
+#                    ("tree" , 0.00, 1)]
+##############################################################################
+# BREAST: READ LIMIT 100
+##############################################################################
+#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
+#         (ROOT + "dst/breast-linproj-101"   , 0.00, 2),
+#         (ROOT + "dst/breast-radviz-101"    , 0.00, 2),
+#         (ROOT + "dst/breast-polyviz-101"   , 0.00, 2)] 
+#
+#input_classifier = [(ROOT + "dst/breast-tree-110" , 0.00, 1),
+#                    (ROOT + "dst/breast-knn-110"  , 0.00, 2),
+#                    (ROOT + "dst/breast-bayes-110", 0.00, 2)]
+#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
+##############################################################################
+# BREAST: READ LIMIT 500
+##############################################################################
+#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
+#         (ROOT + "dst/breast-linproj-501"   , 0.00, 2),
+#         (ROOT + "dst/breast-radviz-501"    , 0.00, 2),
+#         (ROOT + "dst/breast-polyviz-501"   , 0.00, 2)] 
+#
+#input_classifier = [(ROOT + "dst/breast-tree-500" , 0.00, 1),
+#                    (ROOT + "dst/breast-knn-510"  , 0.00, 2),
+#                    (ROOT + "dst/breast-bayes-510", 0.00, 2),
+#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
+##############################################################################
+# ZOO: READ LIMIT 100
+##############################################################################
+# Active configuration: (method, name, ratio, knn) per projection type.
+input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
+         (vr.RADVIZ           , "radviz"     , 0.00, 1),
+         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 
+
+# (learner name, ratio, knn) triples consumed by getClassifiers below.
+input_classifier = [("tree" , 0.00, 1),
+                    ("bayes", 0.00, 1),
+                    ("knn"  , 0.00, 1)]
+
+#input = [("dst/zoo-scatterplot-120", 0.00, 1),
+#         ("dst/zoo-linproj-93"     , 0.00, 1),
+#         ("dst/zoo-radviz-100"     , 0.00, 1),
+#         ("dst/zoo-polyviz-100"    , 0.00, 1)] 
+#
+#input_classifier = [("dst/zoo-tree-100" , 0.00, 1),
+#                    ("dst/zoo-bayes-192", 0.00, 1),
+#                    ("dst/zoo-knn-192"  , 0.00, 1)]
+##############################################################################
+# ZOO: READ LIMIT 500
+##############################################################################
+#input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+#         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 2),
+#         (vr.RADVIZ           , "radviz"     , 0.00, 4),
+#         (vr.POLYVIZ          , "polyviz"    , 0.00, 4)] 
+#input_classifier = [("tree" , 0.00, 2),
+#                    ("bayes", 0.00, 4),
+#                    ("knn"  , 0.00, 2)]
+#input = [("dst/zoo-scatterplot-120", 0.00, 1),
+#         ("dst/zoo-linproj-453"    , 0.00, 2),
+#         ("dst/zoo-radviz-500"     , 0.00, 4),
+#         ("dst/zoo-polyviz-500"    , 0.00, 4)] 
+#input_classifier = [("dst/zoo-tree-500" , 0.00, 2),
+#                    ("dst/zoo-bayes-573", 0.00, 4),
+#                    ("dst/zoo-knn-573"  , 0.00, 2)]
+##############################################################################
+# PRIMARY: READ LIMIT 500
+##############################################################################
+#input = [(ROOT + "dst/primary-scatterplot-153", 0.00, 1),
+#         (ROOT + "dst/primary-linproj-377"    , 0.00, 2),
+#         (ROOT + "dst/primary-radviz-500"     , 0.00, 4),
+#         (ROOT + "dst/primary-polyviz-500"    , 0.00, 4)]
+#input_classifier = [("dst/primary-tree-500" , 0.00, 2),
+#                    ("bayes", 0.00, 2),
+#                    ("knn"  , 0.00, 2)]
+##############################################################################
+
+def getProjections(input, attributes=None):
+    """Compute or load projection models for every entry of `input`.
+
+    Each entry is either (file_prefix, ratio, knn) — previously saved
+    results are loaded from file_prefix.res/.dst/.tab — or
+    (method, name, ratio, knn), in which case projections are searched
+    with metamining() on data_c.
+    Returns (vizrs, nets, smxs, projs_points, methods, uuid_results),
+    where uuid_results maps uuid -> (method, result, points, classifier, attrs).
+    """
+    vizrs = []
+    smxs = []
+    nets = []    
+    projs_points = []
+    methods = []
+    uuid_results = {}
+    for inp in input:
+        # A string first element means "load cached results from disk".
+        if type(inp[0]) == type(""):
+            fn, ratio, knn = inp
+            print "READING: %s" % fn            
+            pkl_file = open(fn + ".res", "rb")
+            res = pickle.load(pkl_file)
+            pkl_file.close()
+            matrix, labels, data = OWDistanceFile.readMatrix(fn + ".dst")
+            matrix.items = orange.ExampleTable(fn + ".tab")
+            net = matrix2network(matrix, ratio, knn)
+            net.items = matrix.items
+            # res[uuid] is (method, result, points, ...) — pick fields by index.
+            vizrs.append([res[ex['uuid'].value][1] for ex in net.items])
+            smxs.append(matrix)
+            nets.append(net)
+            projs_points.append([res[ex['uuid'].value][2] for ex in net.items])
+            methods.append([res[ex['uuid'].value][0] for ex in net.items])
+            uuid_results.update(res)
+                
+        else:
+            # Fresh projection search for this visualization method.
+            method, name, ratio, knn = inp
+            vizr, smx_knn_predict, proj_points = metamining(data_c, method, name, PROJECTION_LIMIT, OUT_FILE, attributes)
+            net = matrix2network(smx_knn_predict, ratio, knn)
+            net.items = smx_knn_predict.items
+            vizrs.append(vizr.results)
+            smxs.append(smx_knn_predict)
+            nets.append(net)
+            projs_points.append(proj_points)
+            methods.append(method)
+            attrs = [ex['attributes'].value.split(', ') for ex in net.items]
+            # Build uuid -> (method, result, points, None, attrs); the None slot
+            # holds the classifier for classifier-based models elsewhere.
+            uuid_results.update(dict(zip([ex["uuid"].value for ex in net.items], \
+                                zip([method for ex in net.items], [r for r in vizr.results], \
+                                    proj_points, [None for ex in net.items], attrs))))
+          
+    return vizrs, nets, smxs, projs_points, methods, uuid_results
+
+def getClassifiers(input, attributes=[]):
+    results = {}
+    smxs = []
+    nets = []
+    for s, ratio, knn in input:
+        classifiers = None
+        if s.upper() == "TREE":
+            print "TREE: calculating classifiers"
+            method = 5
+            tree = orngTree.TreeLearner(storeNodeClassifier = 1, storeContingencies=0, \
+              storeDistributions=1, minExamples=5, storeExamples=1).instance()
+            gini = orange.MeasureAttribute_gini()
+            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+            tree.maxDepth = 4
+            tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
+            forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
+            forest = forestLearner(data_d)
+            classifiers = forest.classifiers
+            outFile = OUT_FILE + "-tree"
+            
+        if s.upper() == "BAYES":
+            if len(attributes) == 0: continue
+            print "BAYES: calculating classifiers"
+            method = 6
+            
+            classifiers = []
+            for atts in attributes:
+                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+                classifiers.append(orange.BayesLearner(data2))
+            outFile = OUT_FILE + "-bayes"
+            
+        if s.upper() == "KNN":
+            if len(attributes) == 0: continue
+            print "kNN: calculating classifiers"
+            method = 7
+            
+            classifiers = []
+            for atts in attributes:
+                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+                classifiers.append(orange.kNNLearner(data2, k=math.sqrt(len(data2))|1))
+            outFile = OUT_FILE + "-knn"
+            
+        if classifiers:
+            smx, RV = classifier2matrix(data_d, method, classifiers, outFile)
+            methods, uuids, res, projections, classifiers, attrs = RV
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(dict(zip(uuids, zip(methods, res, projections, classifiers, attrs))))
+            smxs.append(smx)
+            nets.append(net)
+
+        if os.path.exists("%s.res" % s):
+            print "READING: %s" % s
+            res = pickle.load(open("%s.res" % s, 'rb'))
+            smx, labels, data = OWDistanceFile.readMatrix("%s.dst" % s)
+            smx.items = orange.ExampleTable("%s.tab" % s)
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(res)
+            smxs.append(smx)
+            nets.append(net)
+            
+    return nets, smxs, results
+    
+def getClusters(nets, smxs):
+    median_matrices = []
+    median_indices = []
+    median_csizes = []
+    best_indices = []
+    for net, smx in zip(nets, smxs):    
+        median_matrix, medians, csizes, bests = cluster2matrix(net, smx)
+        median_matrices.append(median_matrix)
+        median_indices.append(medians)
+        median_csizes.append(csizes)
+        best_indices.append(bests)
+    
+    return median_matrices, median_indices, median_csizes, best_indices
+
+out = getModelsExampleTable()
+
+##############################################################################
+# GET PROJECTIONS
+##############################################################################
+##############################################################################
+# GENERATE ATTRIBUTES FROM PROJECTIONS
+#attrs = [u.split(';') for u in list(set([';'.join(sorted(v[4])) for v in res.values()]))]
+
+attributes = getRandomAttributeSubsets(data_d.domain, LIMIT)
+attributes = attributes + [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+
+attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-436.tab') if ex['model'].value != 'SCATTERPLOT']
+attributes = set(attributes)
+attributes = [attr.split(', ') for attr in attributes]
+
+vizrs, nets, smxs, projs_points, methods, res = getProjections(input, attributes)
+
+for smx in smxs:
+    print smx.dim
+    
+merged_vizr_res = []
+merged_proj_points = []
+probabilities = []
+results = []
+
+for i, vizr in enumerate(vizrs):
+    merged_vizr_res.extend(vizrs[i])
+    merged_proj_points.extend(projs_points[i])
+    results.extend([v[5].get("Results").results for v in vizrs[i]])
+    ex = orange.Example(out.domain)
+    for m in range(len(vizr)):
+        ex["uuid"] = nets[i].items[m]["uuid"].value
+        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
+        ex["CA"] = nets[i].items[m]["CA"].value
+        #ex["AUC"] = nets[i].items[m]["AUC"].value
+        #ex["CA by class"] = nets[i].items[m]["CA by class"].value
+        #ex["cluster CA"] = best_indices[i][j]
+        ex["attributes"] = nets[i].items[m]["attributes"].value
+        ex["model"] = nets[i].items[m]["model"].value
+        #ex["cluster size"] = median_csizes[i][j]
+        out.append(ex)
+
+# Keep the projection results (uuid -> model info) for the final save below.
+uuid_results = res
+#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
+#print "projection results:", len(res)
+#merged_vizr_res = []
+#merged_proj_points = []
+#probabilities = []
+#results = []
+#models = []
+#for i, medians in enumerate(median_indices):    
+#    for j, m in enumerate(medians):
+#        vizrs[i][m][5]['Method'] = methods[i]
+#        models.append(methods[i])
+#        ex = orange.Example(out.domain)
+#        ex["uuid"] = nets[i].items[m]["uuid"].value
+#        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
+#        ex["CA"] = nets[i].items[m]["CA"].value
+#        ex["AUC"] = nets[i].items[m]["AUC"].value
+#        ex["CA by class"] = nets[i].items[m]["CA by class"].value
+#        ex["cluster CA"] = best_indices[i][j]
+#        ex["attributes"] = nets[i].items[m]["attributes"].value
+#        ex["model"] = nets[i].items[m]["model"].value
+#        ex["cluster size"] = median_csizes[i][j]
+#        out.append(ex)
+#        
+#    merged_vizr_res.extend([vizrs[i][m] for m in medians])
+#    merged_proj_points.extend([projs_points[i][m] for m in medians])
+#    results.extend([vizrs[i][m][5].get("Results").results for m in medians])
+
+##############################################################################
+# GET CLASSIFIERS
+##############################################################################
+#attributes = attributes + [[var.name] for var in data_d.domain if var != data_d.domain.classVar]
+#
+#nets, smxs, uuid_results = getClassifiers(input_classifier, attributes)
+#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
+#
+#print "process classifiers"
+#uuid_results.update(res)
+#print "merged results:", len(uuid_results)
+#for i in range(len(nets)):
+#    items = nets[i].items
+#    median_matrix = median_matrices[i]
+#    medians = median_indices[i]
+#    for j, m in enumerate(medians):
+#        out_ex = orange.Example(out.domain)
+#        out_ex["uuid"] = items[m]["uuid"].value
+#        out_ex["number of attributes"] = items[m]["number of attributes"].value
+#        out_ex["CA"] = items[m]["CA"].value
+#        out_ex["AUC"] = items[m]["AUC"].value
+#        out_ex["CA by class"] = items[m]["CA by class"].value
+#        out_ex["cluster CA"] = best_indices[i][j]
+#        out_ex["attributes"] = items[m]["attributes"].value
+#        out_ex["model"] = items[m]["model"].value
+#        out_ex["cluster size"] = median_csizes[i][j]
+#        out.append(out_ex)
+#        
+#    results.extend([uuid_results[items[m]["uuid"].value][1] for m in medians])
+#    models.extend([items[m]["model"].value for m in medians])
+#
+# Build the pairwise model-distance matrices from the collected prediction
+# results and persist everything under the OUT_FILE prefix.
+smx_class, smx_prob = models2matrix(results)
+
+print "saving", '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim)
+saveSymMatrix(smx_prob, '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim), out)
+out.save('%s-allmodels-%d.tab' % (OUT_FILE, smx_prob.dim))
+# Keep only results whose uuid made it into the output table; any uuid with
+# no matching entry is printed so missing models show up in the log.
+nr = {}
+for ex in out:
+    uuid = ex['uuid'].value 
+    if uuid in uuid_results:
+        nr[uuid] = uuid_results[uuid]
+    else:
+        print uuid
+pickle.dump(nr, open('%s-allmodels-%d.res' % (OUT_FILE, smx_prob.dim), "wb"))
+pickle.dump(uuid_results, open('%s-allmodels-%d-all.res' % (OUT_FILE, smx_prob.dim), "wb"))

archive/validation.py

+import os
+import os.path
+import gc
+import pickle
+import random
+
+import orngNetwork
+import OWDistanceFile
+
+from tools import *
+from projections2matrix import *
+from matrix2network import *
+from classifier2matrix import *
+
+LIMIT = 10
+PROJECTION_LIMIT = LIMIT
+
+data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/zoo.tab")
+
+input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
+         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
+         (vr.RADVIZ           , "radviz"     , 0.00, 1),
+         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 
+
+input_classifier = [("bayes", 0.00, 1),
+                    ("knn"  , 0.00, 1)]
+
+
+def getProjections(input, train, attributes=None):
+    """Collect projection models, either from cached result files or by
+    running metamining on *train*.
+
+    Each entry of *input* is either a filename prefix (str), in which case
+    the pickled results, distance matrix and item table are read from disk,
+    or a (method, name, ratio, knn) tuple that triggers a fresh metamining
+    run.  Returns (vizrs, nets, smxs, projs_points, methods, uuid_results),
+    with one list entry per *input* entry.
+    """
+    vizrs = []
+    smxs = []
+    nets = []    
+    projs_points = []
+    methods = []
+    uuid_results = {}
+    for inp in input:
+        if type(inp[0]) == type(""):
+            # Cached branch: result dict maps uuid -> (method, vizr result,
+            # projection points, ..., attributes); see the else branch below.
+            fn, ratio, knn = inp
+            print "READING: %s" % fn            
+            pkl_file = open(fn + ".res", "rb")
+            res = pickle.load(pkl_file)
+            pkl_file.close()
+            matrix, labels, data = OWDistanceFile.readMatrix(fn + ".dst")
+            matrix.items = orange.ExampleTable(fn + ".tab")
+            net = matrix2network(matrix, ratio, knn)
+            net.items = matrix.items
+            vizrs.append([res[ex['uuid'].value][1] for ex in net.items])
+            smxs.append(matrix)
+            nets.append(net)
+            projs_points.append([res[ex['uuid'].value][2] for ex in net.items])
+            methods.append([res[ex['uuid'].value][0] for ex in net.items])
+            uuid_results.update(res)
+                
+        else:
+            # Fresh run: evaluate projections with vizrank, then build the
+            # model network from the kNN-prediction distance matrix.
+            method, name, ratio, knn = inp
+            vizr, smx_knn_predict, proj_points = metamining(train, method, name, PROJECTION_LIMIT, OUT_FILE, attributes)
+            net = matrix2network(smx_knn_predict, ratio, knn)
+            net.items = smx_knn_predict.items
+            vizrs.append(vizr.results)
+            smxs.append(smx_knn_predict)
+            nets.append(net)
+            projs_points.append(proj_points)
+            methods.append(method)
+            attrs = [ex['attributes'].value.split(', ') for ex in net.items]
+            uuid_results.update(dict(zip([ex["uuid"].value for ex in net.items], \
+                                zip([method for ex in net.items], [r for r in vizr.results], \
+                                    proj_points, [None for ex in net.items], attrs))))
+          
+    return vizrs, nets, smxs, projs_points, methods, uuid_results
+
+def getClassifiers(input, train, attributes=[]):
+    """Build (or load from cache) classifier-based models.
+
+    Each *input* entry is (name, ratio, knn); name is "bayes"/"knn" to train
+    one classifier per attribute subset in *attributes*, or a filename prefix
+    of previously saved results.  Returns (nets, smxs, results) where
+    results maps uuid -> model record.
+    """
+    results = {}
+    smxs = []
+    nets = []
+    for s, ratio, knn in input:
+        classifiers = None
+        if s.upper() == "BAYES":
+            if len(attributes) == 0: continue
+            print "BAYES: calculating classifiers"
+            method = 6
+            
+            classifiers = []
+            for atts in attributes:
+                # Restrict the training data to the current attribute subset
+                # (plus the class) by ignoring everything else.
+                exclude = [att for att in train.domain if att.name not in atts + [train.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(train, attributes = exclude)
+                classifiers.append(orange.BayesLearner(data2))
+            outFile = OUT_FILE + "-bayes"
+            
+        if s.upper() == "KNN":
+            if len(attributes) == 0: continue
+            print "kNN: calculating classifiers"
+            method = 7
+            
+            classifiers = []
+            for atts in attributes:
+                exclude = [att for att in train.domain if att.name not in atts + [train.domain.classVar.name]]
+                data2 = orange.Preprocessor_ignore(train, attributes = exclude)
+                # k = sqrt(n) rounded down, forced odd via bitwise OR with 1
+                # to avoid voting ties.
+                classifiers.append(orange.kNNLearner(data2, k=int(math.sqrt(len(data2)))|1))
+            outFile = OUT_FILE + "-knn"
+            
+        if classifiers:
+            smx, RV = classifier2matrix(train, method, classifiers, outFile)
+            methods, uuids, res, projections, classifiers, attrs = RV
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(dict(zip(uuids, zip(methods, res, projections, classifiers, attrs))))
+            smxs.append(smx)
+            nets.append(net)
+
+        # Cache branch: *s* is treated as a filename prefix if matching
+        # result files exist on disk.
+        if os.path.exists("%s.res" % s):
+            print "READING: %s" % s
+            res = pickle.load(open("%s.res" % s, 'rb'))
+            smx, labels, data = OWDistanceFile.readMatrix("%s.dst" % s)
+            smx.items = orange.ExampleTable("%s.tab" % s)
+            net = matrix2network(smx, ratio, knn)
+            net.items = smx.items
+            
+            results.update(res)
+            smxs.append(smx)
+            nets.append(net)
+            
+    return nets, smxs, results
+
+# 50/50 train/test split; the same index vector is applied to both the
+# continuous (_c) and discrete (_d) variants so their rows stay aligned.
+indices2 = orange.MakeRandomIndices2(p0=0.5)
+ind = indices2(data_c)
+train_c = data_c.select(ind, 1)
+test_c = data_c.select(ind, 0)
+train_d = data_d.select(ind, 1)
+test_d = data_d.select(ind, 0)
+
+# Random attribute subsets plus one subset with all attributes.
+attributes = getRandomAttributeSubsets(data_d.domain, LIMIT)
+attributes = attributes + [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+        
+#def construct_models(train_d, train_c, test_d, test_c):
+vizrs, nets_proj, smxs_proj, projs_points, methods, res = getProjections(input, train_c, attributes)
+nets_class, smxs_class, uuid_results = getClassifiers(input_classifier, train_d, attributes)
+
+out = getModelsExampleTable()
+uuid_results.update(res)
+
+# One output row per model node; the cluster columns are -1 because no
+# clustering is performed in this script.
+results = []
+for i in range(len(nets_proj)):
+    for j in range(len(nets_proj[i].items)):
+        ex = orange.Example(out.domain)
+        ex["uuid"] = nets_proj[i].items[j]["uuid"].value
+        ex["number of attributes"] = nets_proj[i].items[j]["number of attributes"].value
+        ex["CA"] = nets_proj[i].items[j]["CA"].value
+        ex["AUC"] = nets_proj[i].items[j]["AUC"].value
+        ex["CA by class"] = nets_proj[i].items[j]["CA by class"].value
+        ex["cluster CA"] = -1
+        ex["attributes"] = nets_proj[i].items[j]["attributes"].value
+        ex["model"] = nets_proj[i].items[j]["model"].value
+        ex["cluster size"] = -1
+        out.append(ex)
+        
+    # Projection results are stored as vizrank tuples; index 5 holds a dict
+    # with the per-instance "Results" object.
+    results.extend([vizrs[i][j][5].get("Results").results for j in range(len(nets_proj[i].items))])
+    
+        
+for i in range(len(nets_class)):
+    for j in range(len(nets_class[i].items)):
+        ex = orange.Example(out.domain)
+        ex["uuid"] = nets_class[i].items[j]["uuid"].value
+        ex["number of attributes"] = nets_class[i].items[j]["number of attributes"].value
+        ex["CA"] = nets_class[i].items[j]["CA"].value
+        ex["AUC"] = nets_class[i].items[j]["AUC"].value
+        ex["CA by class"] = nets_class[i].items[j]["CA by class"].value
+        ex["cluster CA"] = -1
+        ex["attributes"] = nets_class[i].items[j]["attributes"].value
+        ex["model"] = nets_class[i].items[j]["model"].value
+        ex["cluster size"] = -1
+        out.append(ex)
+        
+    # Classifier results store the per-instance results directly at index 1.
+    results.extend([uuid_results[nets_class[i].items[j]["uuid"].value][1] for j in range(len(nets_class[i].items))])    
+
+smx_class, smx_prob = models2matrix(results)
+
+print "saving", '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim)
+saveSymMatrix(smx_prob, '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim), out)
+out.save('%s-allmodels-%d.tab' % (OUT_FILE, smx_prob.dim))
+# Persist only the results that correspond to rows of the output table.
+nr = {}
+for ex in out:
+    uuid = ex['uuid'].value 
+    if uuid in uuid_results:
+        nr[uuid] = uuid_results[uuid]
+    else:
+        print uuid
+pickle.dump(nr, open('%s-allmodels-%d.res' % (OUT_FILE, smx_prob.dim), "wb"))
+#construct_models(train_d, train_c, test_d, test_c)
+import orange
+import OWDistanceFile
+import orngClustering
+
+# Load the tau-based projection distance matrix and its item table.
+smx, lbl, data = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-500-tau.dst')
+data = orange.ExampleTable(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-500-tau.tab')
+
+# Clamp negative entries to zero: hierarchical clustering expects a
+# non-negative dissimilarity matrix.
+for i in range(smx.dim):
+    for j in range(i):
+        if smx[i,j] < 0:
+            smx[i,j] = 0
+            
+root = orange.HierarchicalClustering(smx, linkage=orange.HierarchicalClustering.Complete)
+
+def printClustering2(cluster):
+    """Return a nested-parentheses string representation of the dendrogram;
+    leaves are rendered as tuples of their item indices."""
+    if cluster.branches:
+        return "(%s%s)" % (printClustering2(cluster.left), printClustering2(cluster.right))
+    else:
+        return str(tuple(cluster))
+
+def prune(cluster, togo):
+    """Cut the dendrogram below the height budget *togo*: once the budget
+    (decremented by each node's height) drops below zero, drop the subtree
+    by setting its branches to None.  Mutates *cluster* in place."""
+    if cluster.branches:
+        if togo<0:
+            cluster.branches = None
+        else:
+            for branch in cluster.branches:
+                prune(branch, togo - cluster.height)
+
+#prune(root, 2)
+#printClustering2(root)
+# Cut the dendrogram into the top `nclusters` clusters; `clustered[i]` is
+# the cluster id of item i.
+nclusters = 20                
+clustered = orngClustering.hierarhicalClustering_topClustersMembership(root, nclusters)
+
+# Invert the membership list: cluster id -> list of item indices.
+l = {}
+for ndx, c in enumerate(clustered):
+    l[c] = l[c] + [ndx] if c in l else [ndx]
+
+# From each cluster of at least 5 members, keep the item with the highest
+# 'vizrank' score.
+bestincluster = []
+for i, cluster in l.items():
+    best_val = 0
+    best_ndx = -1
+    
+    if len(cluster) < 5:
+        continue
+
+    for c in cluster:
+        if float(data[c]['vizrank']) > best_val:
+            best_val = float(data[c]['vizrank'])
+            best_ndx = c
+            
+    #print best_ndx, best_val
+    bestincluster.append(best_ndx)
+
+# Sub-matrix of pairwise distances between the selected representatives.
+dim = len(bestincluster)    
+newsmx = orange.SymMatrix(dim)
+for i in range(dim):
+    for j in range(i):
+        newsmx[i,j] = smx[bestincluster[i], bestincluster[j]]
+
+newsmx.items = data.getitems(bestincluster)
+
+def saveSymMatrix(matrix, file):
+    """Write *matrix* in the labeled .dst distance-file format (lower
+    triangle incl. diagonal, tab-separated) and save its item table as
+    <file>.tab.  Row labels come from the items' 'label' attribute."""
+    fn = open(file + ".dst", 'w')
+    fn.write("%d labeled\n" % matrix.dim)
+    
+    for i in range(matrix.dim):
+        fn.write("%s" % matrix.items[i]['label'])
+        for j in range(i+1):
+            fn.write("\t%.6f" % matrix[i,j])
+        fn.write("\n")
+        
+    fn.close()
+    matrix.items.save(file + ".tab")
+
+saveSymMatrix(newsmx, r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-bestinclust-20-tau') 
+    

build_astra_map.py

+import Orange
+import orngVizRank as vr
+
+from tools import *
+from build_model_map import save_models, models2matrix, build_projection_model
+
+FOLDS = 10
+MODEL_LIMIT = 3000
+
+#data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/639_500FPRDK.tab")
+
+# Stratified CV fold indices shared by all models so their per-instance
+# predictions are comparable.
+indices = Orange.core.MakeRandomIndicesCV(data_d, FOLDS, randseed=0, stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)
+##
+#attributes  = getRandomAttributeSubsets(data_d.domain, MODEL_LIMIT)
+#attributes += [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+##
+##attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
+##attributes = set(attributes)
+##attributes = [attr.split(', ') for attr in attributes]
+##
+##
+# Build one scatterplot model per attribute pair, capped at MODEL_LIMIT
+# randomly chosen pairs.
+models = []
+scatterplot_attributes = []
+for i in range(len(data_d.domain.attributes)):
+    for j in range(i):
+        scatterplot_attributes.append([data_d.domain.attributes[i].name, data_d.domain.attributes[j].name])
+
+print "attributes:", len(data_d.domain.attributes)
+print "attribute combinations:", len(scatterplot_attributes)
+# NOTE(review): `random` is not imported here -- presumably provided by
+# `from tools import *`; confirm.
+random.shuffle(scatterplot_attributes)
+models.extend([build_projection_model(data_d, attrs, indices, vr.SCATTERPLOT) for attrs in scatterplot_attributes[:MODEL_LIMIT]])
+
+#for projection_type in [vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ]:
+#    models.extend([build_projection_model(data_d, attrs, indices, projection_type) for attrs in attributes])
+
+# build_projection_model returns None on failure; drop those entries.
+models = [model for model in models if model is not None]
+smx_rank = models2matrix(models)
+
+save_models(models, smx_rank, '%s-%d' % (OUT_FILE, len(smx_rank)))

build_ensemble_map.py

+import Orange
+
+from tools import *
+from build_model_map import save_models, models2matrix
+
+#FOLDS = 10
+MODEL_LIMIT = 5000
+
+#data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/dermatology.tab")
+
+
+def get_attributes(node):
+    """Recursively collect the names of attributes used as split criteria
+    in the tree rooted at *node* (duplicates included; callers dedupe)."""
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(get_attributes(node.branches[i]))
+    return atts
+
+def build_rf_models(data):
+    
+    tree = Orange.classification.tree.TreeLearner(storeNodeClassifier = 1, 
+                   storeContingencies=0, storeDistributions=1, minExamples=5, 
+                   storeExamples=1).instance()
+    gini = Orange.feature.scoring.Gini()
+    tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+    tree.maxDepth = 5
+    tree.split = Orange.ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
+    forestLearner = Orange.ensemble.forest.RandomForestLearner(learner=tree, trees=MODEL_LIMIT)
+    forestClassifier = forestLearner(data)
+    
+    models = []
+    for classifier in forestClassifier.classifiers:
+        probabilities, instance_predictions, instance_classes = [], [], []
+        for i in range(len(data)):
+            
+                ex = Orange.data.Instance(data[i])
+                ex.setclass("?")
+                cr = classifier(ex, Orange.core.GetBoth)
+                if cr[0].isSpecial():
+                    raise "Classifier %s returned unknown value" % (classifier.name)
+                
+                probabilities.append(numpy.array(list(cr[1])))
+                instance_predictions.append(cr[0])
+                instance_classes.append(data[i].get_class())
+                
+        models.append({'method' : 'TREE', 
+                       'classifier' : classifier, 
+                       'probabilities' : probabilities, 
+                       'YAnchors' : None, 
+                       'XAnchors' : None, 
+                       'attributes': list(set(get_attributes(classifier.tree))),
+                       'instance_predictions' : instance_predictions,
+                       'instance_classes' : instance_classes})        
+    return models
+
+# Build the forest, compute the model-distance matrix, and persist both.
+models = build_rf_models(data_d)
+smx_rank = models2matrix(models)
+save_models(models, smx_rank, '%s-%d' % (OUT_FILE, len(smx_rank)))

classifier2matrix.py

+import time
+import uuid
+import pickle
+import numpy
+
+import orange
+import orngTree
+import orngEnsemble
+import orngTest
+import orngStat
+
+from tools import *
+
+def getForestAttributes(node):
+    """Recursively collect the names of attributes used as split criteria
+    in the tree rooted at *node* (duplicates included; callers dedupe)."""
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(getForestAttributes(node.branches[i]))
+    return atts
+
+def getAttributes(classifier):
+    """Return the attribute names *classifier* uses: split attributes for a
+    tree, otherwise all attributes of its domain."""
+    if type(classifier).__name__ == "TreeClassifier":
+        return getForestAttributes(classifier.tree)
+    else:
+        return [var.name for var in classifier.domain.attributes]
+
+def classifier2matrix(data, method, classifiers, fn=None, labels=None):
+    """Evaluate *classifiers* on *data*, build their pairwise prediction
+    distance matrix, and optionally persist everything under prefix *fn*.
+
+    method  -- index into MODEL_LIST naming the model type
+    labels  -- optional per-classifier display labels
+    Returns (smx_probs, RV) where RV = (methods, uuids, results,
+    projections, classifiers, attribute lists), aligned by classifier.
+    """
+    results = [orngTest.testOnData([c], data) for c in classifiers]
+
+    # Per-class evaluation: test each classifier on the subset of *data*
+    # belonging to each class value.
+    cv = data.domain.classVar.name
+    resultsByClass = [[orngTest.testOnData([c], data.filter({cv : val})) for val in data.domain.classVar.values] for c in classifiers]
+    
+    out = getModelsExampleTable()
+    
+    # classprobs: probability assigned to the true class per instance;
+    # predictprobs: full probability vector per instance.
+    model_classprobs = []
+    model_predictprobs = []
+    for i, result in enumerate(results):
+        model_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in result.results]))
+        model_predictprobs.append([numpy.array(res.probabilities[0]) for res in result.results])
+        attributes = list(set(getAttributes(classifiers[i])))
+        ex = orange.Example(out.domain)
+        ex['uuid'] = uuid.uuid4().hex
+        ex['model'] = MODEL_LIST[method]
+        ex['attributes'] = ", ".join(sorted(attributes))
+        ex['number of attributes'] = len(attributes)
+        ex['CA'] = orngStat.CA(result)[0]
+        ex['AUC'] = orngStat.AUC(result)[0]
+        ex['CA by class'] = ", ".join([str(orngStat.CA(res)[0]) for res in resultsByClass[i]])
+        ex['label'] = labels[i] if labels else MODEL_LIST[method]
+        out.append(ex)
+        
+    ##########################################################################
+    ## calculate projection distance matrices
+    print 'calculating model distance matrices,', len(model_classprobs), 'models'
+    dim = len(model_classprobs)
+    smx_class = orange.SymMatrix(dim)
+    smx_probs = orange.SymMatrix(dim)
+    
+    counter = 0
+    time_start = time.time()
+    
+    for i in range(dim):
+        for j in range(i+1, dim):
+            # Squared-error distance on true-class probabilities ...
+            smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
+            # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
+            smx_probs[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
+                                         
+            counter += 1
+            if counter % 5000 == 0:
+                # Progress with a remaining-time estimate (h : m : s).
+                time_elapsed = time.time() - time_start
+                time_total = time_elapsed / counter * dim * (dim - 1) / 2
+                time_remainng = int(time_total - time_elapsed)
+                print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+                
+    RV = ([method for ex in out], [ex["uuid"].value for ex in out], [r.results for r in results], \
+          [None for ex in out], classifiers, [ex["attributes"].value.split(', ') for ex in out])
+    
+    if fn:
+        #saveSymMatrix(smx_class, root + out_file + "-" + "tree" + '-' + str(dim) + '-class', out)
+        saveSymMatrix(smx_probs, '%s-%d' % (fn, dim), out)
+        out.save('%s-%d.tab' % (fn, dim))
+        
+        # NOTE(review): SVM classifiers are dropped before pickling --
+        # presumably because they are not picklable; confirm.
+        for i, ex in enumerate(out):
+            if str(ex["model"].value) == "SVM":
+                classifiers[i] = None
+        
+        pickle.dump(dict(zip([ex["uuid"].value for ex in out], \
+                             zip([method for ex in out], [r.results for r in results], \
+                                 [None for ex in out], classifiers, \
+                                 [ex["attributes"].value.split(', ') for ex in out]))), \
+                                 open('%s-%d.res' % (fn, dim), 'wb'))
+    
+    smx_probs.items = out
+    return smx_probs, RV
+import os.path
+import pickle
+
+from tools import *
+from matrix2network import *
+
+
+def clusterModel(fn, knn, ratio=0.0):
+    """Load the model map saved under prefix *fn*, build its kNN network,
+    replace it by one representative (median) model per detected cluster,
+    and save the result under '<fn>-clustered'."""
+    print "CLUSTERING:", os.path.split(fn)[1]
+    smx = loadModel(fn)
+    net = matrix2network(smx, ratio, knn)
+    net.items = smx.items
+    msmx = cluster2matrix2(net, smx)
+    saveModel(msmx, "%s-clustered" % fn)
+
+#clusterModel(ROOT + "dst/breast-knn-510"    , 2)
+#clusterModel(ROOT + "dst/breast-tree-500"   , 1)
+#clusterModel(ROOT + "dst/breast-bayes-510"  , 2)
+#clusterModel(ROOT + "dst/breast-svms-510"   , 4)
+#clusterModel(ROOT + "dst/breast-polyviz-501", 2)
+#clusterModel(ROOT + "dst/breast-radviz-501" , 2)
+
+#clusterModel(ROOT + "dst/breast-linproj-501", 2)
+clusterModel(ROOT + "dst/zoo-rf-1000"    , 1)

compare_matrices.py

+#import orange
+import OWDistanceFile
+#import scipy.stats
+
+# Compare two distance matrices by the average overlap of each item's
+# k nearest neighbours (1.0 = identical neighbourhoods).
+smx1, lbl1, data1 = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-10-spearman.dst')
+smx2, lbl2, data2 = OWDistanceFile.readMatrix(r'c:\Users\miha\Projects\res\metamining\dst\zoo-projections-10-pearson.dst')
+k = 5
+c = []
+taus = []
+for i in range(smx1.dim):
+    # k nearest neighbours of item i under each matrix.
+    neighbours1 = [(smx1[i,j], j) for j in range(smx1.dim) if i != j]
+    knn1 = set([b for a,b in sorted(neighbours1)[:k]])
+    neighbours2 = [(smx2[i,j], j) for j in range(smx2.dim) if i != j]
+    knn2 = set([b for a,b in sorted(neighbours2)[:k]])
+    #c.append(len(knn1.intersection(knn2)) / float(len(knn1.union(knn2))))
+    c.append(len(knn1.intersection(knn2)) / float(k))
+    
+
+        
+print sum(c) / float(len(c))
+import time
+import uuid
+import pickle
+import numpy
+
+import orange
+import orngTree
+import orngEnsemble
+import orngTest
+import orngStat
+
+from tools import *
+
+# Standalone script: build a small random forest on the 'primary' data set
+# and save the per-tree model distance matrices.
+root = "C:\\Users\\miha\\Projects\\res\\metamining\\"
+#root = "/home/miha/metamining/"
+out_file = 'dst/primary'
+method = 5
+data = orange.ExampleTable(root + 'tab/primary-c.tab')
+TREE_LIMIT = 15
+
+# Depth-limited gini trees splitting on random 3-attribute subsets.
+tree = orngTree.TreeLearner(storeNodeClassifier = 0, storeContingencies=0, \
+  storeDistributions=1, minExamples=5, storeExamples=1).instance()
+gini = orange.MeasureAttribute_gini()
+tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+tree.maxDepth = 4
+tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
+
+forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
+forest = forestLearner(data)
+
+# Evaluate each tree on the (training) data itself.
+results = [orngTest.testOnData([c], data) for c in forest.classifiers]
+
+out = getModelsExampleTable()
+
+def getAttributes(node):
+    # Recursively collect the split-attribute names of the tree under *node*.
+    atts = []
+    if node.branchSelector:
+        a = node.branchSelector.classVar.name
+        atts.append(a)
+        for i in range(len(node.branches)):
+            if node.branches[i]:
+                atts.extend(getAttributes(node.branches[i]))
+    return atts
+
+# classprobs: probability of the true class per instance;
+# predictprobs: full probability vector per instance.
+model_classprobs = []
+model_predictprobs = []
+for i, result in enumerate(results):
+    model_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in result.results]))
+    model_predictprobs.append([numpy.array(res.probabilities[0]) for res in result.results])
+    attributes = list(set(getAttributes(forest.classifiers[i].tree)))
+    ex = orange.Example(out.domain)
+    ex['uuid'] = uuid.uuid4().hex
+    ex['model'] = MODEL_LIST[method]
+    ex['attributes'] = ", ".join(attributes)
+    ex['number of attributes'] = len(attributes)
+    ex['score'] = orngStat.CA(result)[0]
+    out.append(ex)
+    
+##########################################################################
+## calculate projection distance matrices
+print 'calculating projection distance matrices,', len(model_classprobs), 'models'
+dim = len(model_classprobs)
+smx_class = orange.SymMatrix(dim)
+smx_probs = orange.SymMatrix(dim)
+
+counter = 0
+time_start = time.time()
+
+for i in range(dim):
+    for j in range(i+1, dim):
+        smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
+        # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
+        smx_probs[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
+                                     
+        counter += 1
+        if counter % 500 == 0:
+            # Progress with remaining-time estimate (h : m : s).
+            time_elapsed = time.time() - time_start
+            time_total = time_elapsed / counter * dim * (dim - 1) / 2
+            time_remainng = int(time_total - time_elapsed)
+            print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+    
+saveSymMatrix(smx_class, root + out_file + "-" + "tree" + '-' + str(dim) + '-class', out)
+saveSymMatrix(smx_probs, root + out_file + "-" + "tree" + '-' + str(dim) + '-prob', out)
+out.save(root + out_file + "-" + "tree" + '-' + str(dim) + '.tab')
+output = open(root + out_file + "-" + "tree" + '-' + str(dim) + '.res', 'wb')
+pickle.dump((method, [ex["uuid"].value for ex in out], results, None, forest.classifiers), output)
+output.close()
+from tools import *
+from classifier2matrix import *
+
+print "kNN: calculating classifiers"
+
+data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
+method = 7
+
+# Build kNN variants over the 50 best attribute subsets from a previous
+# model map: every combination of distance measure and k.
+distanceConstructors = [orange.ExamplesDistanceConstructor_Euclidean(),
+                        orange.ExamplesDistanceConstructor_Manhattan(),
+                        orange.ExamplesDistanceConstructor_Hamming(),
+                        orange.ExamplesDistanceConstructor_Maximal()]
+ks = [5, 9, 15]
+
+print 'reading results'
+modelData = orange.ExampleTable(ROOT + "dst/breast-allmodels-283.tab")
+# NOTE(review): splits ex['label'] on ', ' to get attribute names, while
+# other scripts here read ex['attributes'] for that -- confirm 'label'
+# really holds the attribute list in this table.
+CAs, attributes = zip(*sorted([(ex['CA'].value, ex['label'].value.split(', ')) for ex in modelData])[-50:])
+
+print 'constructing classifiers'
+classifiers = []
+labels = []
+for atts in attributes:
+    for k in ks:
+        # Keep only the current attribute subset (plus the class).
+        exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
+        data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
+        
+        for distanceConstructor in distanceConstructors:
+            knn = orange.kNNLearner()
+            knn.k = k
+            knn.distanceConstructor = distanceConstructor
+            classifiers.append(knn(data2))
+            # Label e.g. 'KNN Euclidean k=5'; [29:] strips the constructor
+            # class-name prefix from str(distanceConstructor).
+            labels.append('KNN %s k=%d' % (str(distanceConstructor).split(' ')[0][29:], k))
+
+print 'classifiers to matrix'
+smx, RV = classifier2matrix(data_d, method, classifiers, OUT_FILE + "-knns", labels)
+methods, uuids, res, projections, classifiers, attrs = RV
+
+

matrix2network.py

+import orange
+import orngNetwork
+import OWDistanceFile
+
+from tools import *
+
+def matrix2network(name, ratio, kNN):
+    """Build an orngNetwork.Network from a distance matrix.
+
+    *name* is either a filename prefix (matrix read from
+    '<name>-knnpredict.dst', items from '<name>.tab', network saved back to
+    disk) or a SymMatrix used directly (nothing is saved).  Edges connect
+    pairs below the distance threshold derived from *ratio*, plus each
+    node's *kNN* nearest neighbours.
+    """
+    if type(name) == type(""):
+        dstFile = name + "-knnpredict.dst"
+        tabFile = name + ".tab"
+        netFile = name + "-knnpredict"
+        smx, labels, data = OWDistanceFile.readMatrix(dstFile)
+        net = orngNetwork.Network(smx.dim, 0)
+        lower, upper = net.getDistanceMatrixThreshold(smx, ratio)
+        net.fromDistanceMatrix(smx, 0, upper, kNN, 0)
+        net.items = orange.ExampleTable(tabFile)
+        net.save(netFile)
+    else:
+        smx = name
+        net = orngNetwork.Network(smx.dim, 0)
+        lower, upper = net.getDistanceMatrixThreshold(smx, ratio)
+        net.fromDistanceMatrix(smx, 0, upper, kNN, 0)
+        #net.items = smx.items.getitems()
+        
+    return net
+
+#net_linproj     = matrix2network("primary-linproj-494"    , 0.01, 1)
+#net_polyviz     = matrix2network("primary-polyviz-494"    , 0.01, 1)
+#net_radviz      = matrix2network("primary-radviz-494"     , 0.01, 1)
+#net_scatterplot = matrix2network("primary-scatterplot-253", 0.00, 1)
+
+#smx, labels, data = OWDistanceFile.readMatrix(dstroot + "primary-scatterplot-253-knnpredict.dst")
+#net = orngNetwork.Network(smx.dim, 0)
+#lower, upper = net.getDistanceMatrixThreshold(smx, 0.05)
+#print upper
+
+#############################################################################
+# best in cluster 2 network
+
+def cluster2matrix(net, name):
+    """Detect clusters in *net* by label propagation and reduce the distance
+    matrix to one median model per cluster.
+
+    *name* is either a filename prefix the matrix is read from, or the
+    SymMatrix itself.  Returns (medianmatrix, medians, csizes, bests) where
+    medians are the selected item indices, csizes the cluster sizes and
+    bests the best CA within each cluster.
+    """
+    if type(name) == type(""):
+        dstFile = name + "-knnpredict.dst"
+        smx, labels, data = OWDistanceFile.readMatrix(dstFile)
+    else:
+        smx = name
+    lbls = net.clustering.labelPropagation()
+    clusters = set(lbls)
+    medians = []
+    csizes = []
+    bests = []
+    for c in clusters:
+        cndxs = [i for i, ci in enumerate(lbls) if ci == c]
+        cmatrix = smx.getitems(cndxs)
+        # Median = member with the smallest summed distance to the rest.
+        cdsts  = zip([sum([j for j in i]) for i in cmatrix], cndxs, [net.items[i]['CA'].value for i in cndxs])
+        max_score = max([net.items[i]['CA'].value for i in cndxs])
+        cmedian = min(cdsts)[1]
+        medians.append((cmedian, max_score, len(cndxs)))
+
+    medians.sort()
+    medians, bests, csizes = map(list, zip(*medians))
+    medianmatrix = smx.getitems(medians)
+    medianmatrix.items = net.items.getitems(medians)
+    if type(name) == type(""):
+        # NOTE(review): `dstroot` is not defined in this file -- presumably
+        # exported by `from tools import *`; confirm.
+        saveSymMatrix(medianmatrix, dstroot + "medians-" + name + "-" + str(medianmatrix.dim) + "-knnpredict", None, True)
+    return medianmatrix, medians, csizes, bests
+
+def cluster2matrix2(net, smx):
+    """Like cluster2matrix, but takes the SymMatrix directly and returns a
+    single reduced matrix whose items carry 'cluster size' / 'cluster CA'
+    and whose .results dict keeps only the representatives' entries.
+    Requires *smx* to have a .results dict keyed by uuid."""
+    lbls = net.clustering.labelPropagation()
+    clusters = set(lbls)
+    medians = []
+    csizes = []
+    bests = []
+    for c in clusters:
+        cndxs = [i for i, ci in enumerate(lbls) if ci == c]
+        cmatrix = smx.getitems(cndxs)
+        # Median = member with the smallest summed distance to the rest.
+        cdsts  = zip([sum([j for j in i]) for i in cmatrix], cndxs, [net.items[i]['CA'].value for i in cndxs])
+        max_score = max([net.items[i]['CA'].value for i in cndxs])
+        cmedian = min(cdsts)[1]
+        medians.append((cmedian, max_score, len(cndxs)))
+
+    medians.sort()
+    medians, bests, csizes = map(list, zip(*medians))
+    medianmatrix = smx.getitems(medians)
+    medianmatrix.items = net.items.getitems(medians)
+    medianmatrix.results = {}
+    
+    # Annotate each representative with its cluster's size and best CA.
+    for i in range(len(medianmatrix.items)):
+        medianmatrix.items[i]["cluster size"] = csizes[i]
+        medianmatrix.items[i]["cluster CA"] = bests[i]
+        uuid = medianmatrix.items[i]["uuid"].value
+        medianmatrix.results[uuid] = smx.results[uuid]
+    
+    return medianmatrix
+
+#cluster2matrix(net_linproj,     "primary-linproj-494")
+#cluster2matrix(net_polyviz,     "primary-polyviz-494")
+#cluster2matrix(net_radviz,      "primary-radviz-494")
+#cluster2matrix(net_scatterplot, "primary-scatterplot-253")
+    
+    
+import os.path
+import orange
+
+from tools import *
+
+
+# Merge the per-method (mostly clustered) model maps into one combined
+# breast-cancer model map and save it.
+input = [ROOT + "dst/breast-scatterplot-36"       ,
+         ROOT + "dst/breast-knn-510-clustered"    ,
+         ROOT + "dst/breast-tree-500-clustered"   ,
+         ROOT + "dst/breast-bayes-510-clustered"  , 
+         ROOT + "dst/breast-svms-510-clustered"   , 
+         ROOT + "dst/breast-polyviz-501-clustered", 
+         ROOT + "dst/breast-radviz-501-clustered" , 
+         ROOT + "dst/breast-linproj-501-clustered"]
+
+models = []
+
+for fn in input:
+    print "READING:", os.path.split(fn)[1]
+    models.append(loadModel(fn))
+    
+    
+# Concatenate all item tables (domains assumed identical) and merge the
+# uuid -> result dicts.
+mdata = orange.ExampleTable(models[0].items.domain)
+results = {}
+
+for model in models:
+    mdata.extend(model.items)
+    results.update(model.results)
+    
+# projections have different results than classification models
+vizrResults = [results[ex["uuid"].value][1] if \
+               type(results[ex["uuid"].value][1]) == type([]) else \
+               results[ex["uuid"].value][1][5].get("Results").results for ex in mdata]
+
+smx_class, smx_prob = models2matrix(vizrResults)
+
+smx_prob.items = mdata
+smx_prob.results = results
+
+saveModel(smx_prob, "%sdst/breast-merged-%d" % (ROOT, smx_prob.dim))

mm/__init__.py

Empty file added.
+import uuid
+import pickle
+import itertools
+import scipy.stats
+
+import Orange
+import orngVizRank as vr
+
+from tools import *
+from operator import itemgetter
+
+FOLDS = 10
+MODEL_LIMIT = 500
+
+#data_c = getData(ROOT + "tab/zoo-c.tab")
+data_d = getData(ROOT + "tab/zoo.tab")
+
def build_model(learner, data, indices):
    """Train `learner` on `data` and estimate per-instance class
    probabilities by cross-validation.

    `indices` maps each instance of `data` to a fold in [0, FOLDS); for
    every fold the learner is trained on the remaining folds and the
    held-out instances are classified with their class masked.

    Returns a model dict: method name, a classifier trained on the full
    data, CV class-probability vectors, predicted and true classes, and
    the attribute names used.  XAnchors/YAnchors are None because plain
    (non-projection) learners have no anchors.

    Raises ValueError when the classifier returns an unknown value.
    """
    probabilities = []
    instance_predictions = []
    instance_classes = []
    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)
        classifier = learner(learnset)
        tcn = 0
        for i in range(len(data)):
            if (indices[i] == fold):
                # copy the held-out instance and hide its class so the
                # classifier cannot peek at it
                ex = Orange.data.Instance(testset[tcn])
                ex.setclass("?")

                cr = classifier(ex, Orange.core.GetBoth)
                if cr[0].isSpecial():
                    # was: raise "..." -- raising a string is invalid in
                    # Python >= 2.6 (it raises TypeError instead); raise a
                    # proper exception with the intended message
                    raise ValueError("Classifier %s returned unknown value" % classifier.name)

                probabilities.append(numpy.array(list(cr[1])))
                instance_predictions.append(cr[0])
                instance_classes.append(testset[tcn].get_class())
                tcn += 1

    return {'method' : type(learner).__name__,
            'classifier' : learner(data),
            'probabilities' : probabilities,
            'XAnchors' : None,
            'YAnchors' : None,
            'attributes': [x.name for x in data.domain.attributes],
            'instance_predictions' : instance_predictions,
            'instance_classes' : instance_classes}
+
def build_projection_model(data, attributes, indices, visualizationMethod=vr.LINEAR_PROJECTION):
    """Project `data` onto `attributes` with the chosen visualization
    method and score the 2-D projection with a cross-validated k-NN.

    Returns a model dict (same shape as build_model's result) including
    the projection's anchor coordinates, or None when no valid
    projection could be built.
    """
    method = "?"
    # pick the scaling graph implementation matching the requested method
    if visualizationMethod == vr.SCATTERPLOT:
        import orngScaleScatterPlotData
        graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
        method = "SCATTERPLOT"
    elif visualizationMethod == vr.RADVIZ:
        import orngScaleLinProjData
        graph = orngScaleLinProjData.orngScaleLinProjData()
        graph.normalizeExamples = 1
        method = "RADVIZ"
    elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
        import orngScaleLinProjData
        from orngLinProj import FreeViz
        graph = orngScaleLinProjData.orngScaleLinProjData()
        graph.normalizeExamples = 0
        method = "SPCA"
    elif visualizationMethod == vr.POLYVIZ:
        import orngScalePolyvizData
        graph = orngScalePolyvizData.orngScalePolyvizData()
        graph.normalizeExamples = 1
        method = "POLYVIZ"
    else:
        print "an invalid visualization method was specified. VizRank can not run."
        return

    graph.setData(data, graph.rawSubsetData)
    attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
    # projection domain: x, y coordinates plus the original class variable
    domain = Orange.data.Domain([orange.FloatVariable("xVar"), orange.FloatVariable("yVar"), orange.EnumVariable(graph.dataDomain.classVar.name, values=getVariableValuesSorted(graph.dataDomain.classVar))])
    classListFull = graph.originalData[graph.dataClassIndex]
    table = None

    if visualizationMethod == vr.LINEAR_PROJECTION:
        # optimize anchor placement with supervised PCA
        freeviz = FreeViz(graph)
        projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
        if projections != None:
            XAnchors, YAnchors, (attrNames, newIndices) = projections
            table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
        else:
            print 'a null projection found'
    elif visualizationMethod == vr.SCATTERPLOT:
        XAnchors = YAnchors = None
        table = graph.createProjectionAsExampleTable(attrIndices)
    else:
        # radviz / polyviz: anchors are fixed positions on the unit circle
        XAnchors = graph.createXAnchors(len(attrIndices))
        YAnchors = graph.createYAnchors(len(attrIndices))
        validData = graph.getValidList(attrIndices)
        # more than min number of examples
        if numpy.sum(validData) >= 10:
            classList = numpy.compress(validData, classListFull)
            selectedData = numpy.compress(validData, numpy.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
            sum_i = graph._getSum_i(selectedData)
            table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)

    if not table: return None

    # score the projection: 10-NN accuracy estimated by CV over the same
    # fold indices used for the non-projection models
    probabilities = []
    instance_predictions = []
    instance_classes = []
    learner = orange.kNNLearner(k=10, rankWeight=0, distanceConstructor=orange.ExamplesDistanceConstructor_Euclidean(normalize=0))
    for fold in range(FOLDS):
        learnset = table.selectref(indices, fold, negate=1)
        testset = table.selectref(indices, fold, negate=0)
        classifier = learner(learnset)
        tcn = 0
        # NOTE(review): iterates over len(data) but indexes testset -- this
        # assumes `table` preserves data's instance order and count; confirm
        # for projections that drop invalid rows
        for i in range(len(data)):
            if (indices[i] == fold):
                ex = Orange.data.Instance(testset[tcn])
                ex.setclass("?")

                cr = classifier(ex, Orange.core.GetBoth)
                if cr[0].isSpecial():
                    raise "Classifier %s returned unknown value" % (classifier.name)
                probabilities.append(numpy.array(list(cr[1])))
                instance_predictions.append(cr[0])
                instance_classes.append(testset[tcn].get_class())
                tcn += 1

    classifier = learner(table)
    return {'method' : method,
            'classifier' : classifier,
            'probabilities' : probabilities,
            'XAnchors' : XAnchors,
            'YAnchors' : YAnchors,
            'attributes': attributes,
            'instance_predictions' : instance_predictions,
            'instance_classes' : instance_classes}
+
def build_rf_models(data):
    """Grow a random forest fold-by-fold and collect per-tree CV
    predictions for the held-out instances.

    NOTE(review): this function looks unfinished -- `indices` and
    `learner` are read but never defined in this scope (presumably
    module globals), and nothing is returned.  Confirm before use.
    """
    # was: [[] for fold in FOLDS] -- FOLDS is an int, so iterating it
    # raises TypeError; iterate range(FOLDS) instead
    probabilities = [[] for fold in range(FOLDS)]

    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)

        # small gini-scored trees over random 3-attribute subsets
        tree = Orange.classification.tree.TreeLearner(storeNodeClassifier=1,
                   storeContingencies=0, storeDistributions=1, minExamples=5,
                   storeExamples=1).instance()
        gini = Orange.feature.scoring.Gini()
        tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
        tree.maxDepth = 4
        tree.split = Orange.ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
        forestLearner = Orange.ensemble.forest.RandomForestLearner(learner=tree, trees=MODEL_LIMIT)
        forestClassifier = forestLearner(learnset)

        for classifier in forestClassifier.classifiers:
            tcn = 0
            for i in range(len(data)):
                if (indices[i] == fold):
                    # classify the held-out instance with its class hidden
                    ex = Orange.data.Instance(testset[tcn])
                    ex.setclass("?")
                    tcn += 1
                    cr = classifier(ex, Orange.core.GetBoth)
                    if cr[0].isSpecial():
                        # was: raise "..." -- string exceptions are invalid
                        # in Python >= 2.6; raise a proper exception
                        raise ValueError("Classifier %s returned unknown value" % classifier.name)
                    # NOTE(review): unlike build_model this appends the whole
                    # (prediction, distribution) pair, not a numpy array --
                    # confirm which form downstream code expects
                    probabilities.append(cr)
    model_classifier = learner(data)  # TODO(review): `learner` is undefined here
    model_classifier.probabilities = probabilities
+
def get_learner(type, data):
    """Map a learner-type name to a configured learner instance.

    Currently a stub: every candidate learner below is commented out,
    so the result is always None regardless of the arguments.
    """
    #if type.upper() == "TREE":
    #learner = orange.BayesLearner()
    #learner = orange.kNNLearner(k=int(math.sqrt(len(data))))
    return None
+
def _print_time(time_start, iter, numiter):
    """Every 10000th iteration, print progress and a crude h:m:s ETA for
    a triangular pair-loop of numiter*(numiter-1)/2 total iterations.

    NOTE(review): iter == 0 would divide by zero; the caller below
    starts its counter at 1.
    """
    if iter % 10000 == 0:
        time_elapsed = time.time() - time_start
        # extrapolate the total run time from the fraction of pairs done
        time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
        time_remainng = int(time_total - time_elapsed)
        print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+
def models2matrix(models):
    """Build a model-distance matrix from prediction disagreement.

    The distance between two models is the fraction of instances on
    which their CV predictions differ; a NaN result maps to the maximum
    distance 1.  Only the lower triangle of the returned matrix is
    filled.  The commented-out variants are alternative distances
    (probability-based, Spearman rank) kept for reference.
    """
    dim = len(models)
    print "%d models to matrix -- rank" % dim

    #smx_prob = numpy.zeros(shape=(dim, dim))
    #smx_class = numpy.zeros(shape=(dim, dim))
    smx_rank = numpy.zeros(shape=(dim, dim))
    #smx_rank_None = numpy.zeros(shape=(dim, dim))
    ninstances = len(models[0]['probabilities'])
    normalization_factor = 2 * ninstances

    counter = 0
    time_start = time.time()
    # cache each model's predictions as a numpy array of class values so the
    # pairwise comparison below is a vectorized element-wise !=
    instance_predictions = [numpy.array([pred.value for pred in model['instance_predictions']]) for model in models]
    #model_probs = [model['probabilities'] for model in models]
    for i in range(dim):
        for j in range(i):
            # disagreement rate between models i and j
            w = numpy.average(instance_predictions[i] !=
                                           instance_predictions[j])

            #w = sum([numpy.sum(numpy.power(p1 - p2, 2)) for \
            #            (p1, p2) in zip(model_probs[i], 
            #               model_probs[j])]) / normalization_factor

            #smx_rank[i,j] = 1 - abs(sum([scipy.stats.spearmanr(p1, p2)[0] for \
            #            (p1, p2) in zip(models[i]['probabilities'], 
            #               models[j]['probabilities'])]) / ninstances)

            #smx_rank_0[i,j] = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=0)[0])
            #smx_rank_1[i,j] = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=1)[0])
            #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=None)[0])
            smx_rank[i, j] = 1 if math.isnan(w) else w

            counter += 1
            _print_time(time_start, counter, dim)
    #return smx_prob, smx_class, smx_rank
    return smx_rank
+
def save_models(models, smx, fn):
    """Persist a model map: distance matrix/matrices as .npy, model
    meta-data as an Orange .tab table, and the raw model dicts pickled
    by uuid into a .res file.

    `smx` is either a single matrix or a list of (matrix, title) pairs
    saved as '<fn>-<title>.npy'.
    """
    print 'saving matrix'
    if type(smx) == type([]):
        for s, title in smx:
            numpy.save('%s-%s' % (fn, title), s)
    else:
        numpy.save('%s' % (fn), smx)

    print 'build out data'
    out = getModelsExampleTable()
    uuids = []
    for model in models:
        ex = Orange.data.Instance(out.domain)
        # a fresh uuid links this meta-data row to the pickled model dict
        _uuid = uuid.uuid4().hex
        uuids.append(_uuid)
        ex['uuid'] = _uuid
        ex['number of attributes'] = len(model['attributes'])
        # classification accuracy over the CV predictions
        results = [p == c for p, c in zip(model['instance_predictions'], model['instance_classes'])]
        ex['CA'] = sum(results) / float(len(results))
        ex['model'] = model['method']
        ex['attributes'] = ', '.join(model['attributes'])
        #ex["AUC"] = nets[i].items[m]["AUC"].value
        # per-class accuracy: sort (correct?, class) pairs by class, then
        # group consecutive pairs of the same class value
        resultsByClass = sorted([(p == c, c) for p, c in zip(model['instance_predictions'], model['instance_classes'])], key=itemgetter(1))
        groups = []
        # NOTE(review): the unpacking below reuses the name resultsByClass,
        # clobbering the sorted list after the first group; harmless here
        # because groupby already holds its iterator, but worth renaming
        for _k, g in itertools.groupby(resultsByClass, lambda x: x[1].value):
            resultsByClass, _classes = zip(*g)
            groups.append(resultsByClass)
        ex["CA by class"] = ', '.join([str(sum(results) / float(len(results))) for results in groups])
        #ex["cluster CA"] = best_indices[i][j]
        #ex["cluster size"] = median_csizes[i][j]
        ex["label"] = model['method']
        out.append(ex)

    print 'saving out data'
    out.save('%s.tab' % (fn))
    print 'saving models'
    pickle.dump(dict(zip(uuids, models)), open('%s.res' % (fn), "wb"))
+
+
+#indices = Orange.core.MakeRandomIndicesCV(data_d, FOLDS, randseed=0, stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)
+##
+#attributes  = getRandomAttributeSubsets(data_d.domain, MODEL_LIMIT)
+#attributes += [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+##
+##attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
+##attributes = set(attributes)
+##attributes = [attr.split(', ') for attr in attributes]
+##
+##
+#models = []
+#scatterplot_attributes = []
+#for i in range(len(data_d.domain.attributes)):
+#    for j in range(i):
+#        scatterplot_attributes.append([data_d.domain.attributes[i].name, data_d.domain.attributes[j].name])
+#        
+##random.shuffle(scatterplot_attributes)
+#models.extend([build_projection_model(data_d, attrs, indices, vr.SCATTERPLOT) for attrs in scatterplot_attributes])
+#
+#for projection_type in [vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ]:
+#    models.extend([build_projection_model(data_d, attrs, indices, projection_type) for attrs in attributes])
+#
+#models = [model for model in models if model is not None]
+#smx_prob, smx_class, smx_rank = models2matrix(models)
+#
+#save_models(models, [(smx_prob, 'prob'), (smx_class, 'class'), (smx_rank, 'rank')], '%s-%d' % (OUT_FILE, len(smx_prob)))

model_map_similarity.py

+import numpy
+import Orange
+
+from operator import itemgetter
+from tools import *
+
print 'loading...'

fileName = 'zoo-1603'
fileCommon = ROOT + '_explore_/' + fileName
fileA = fileCommon + '-rank'   # disagreement(rank)-based distance matrix
#fileB = ROOT + 'new/zoo-allmodels-420'
fileB = fileCommon + '-class'  # class-prediction-based distance matrix

# warning saved matrix in lower-diagonal!
modelA = numpy.load('%s.npy' % fileA)
# from lower-diagonal build symmetric 
modelA = modelA + modelA.transpose()
itemsA = Orange.data.Table('%s.tab' % fileCommon)

modelB = numpy.load('%s.npy' % fileB)
modelB = modelB + modelB.transpose()
#smxB = loadModel(fileB)
#modelB = numpy.zeros((smxB.dim, smxB.dim))
#for i in range(smxB.dim):
#    for j in range(smxB.dim):
#        modelB[i,j] = smxB[i,j]

# both matrices were built from the same model list, so they share one
# item table
itemsB = Orange.data.Table('%s.tab' %  fileCommon)
+
def compare_model_similarity(modelA, itemsA, modelB, itemsB):
    """Compare two model-distance matrices by neighborhood overlap.

    Models are matched across the two item tables by their
    (method, sorted attribute set) signature.  For every neighborhood
    size k the score is the average fraction of shared k-nearest
    neighbors (each model itself excluded).  Returns the list of
    scores for k = 2 .. n.
    """
    print len(modelA), 'read in model A,', len(modelB), 'read in model B'
    
    print 'matching...'
    # (signature, row index) pairs, sorted so both lists can be merged
    # with a single linear pass
    matchA = sorted((ex['model'].value + ', '.join(sorted(ex['attributes'].value.split(', '))), i) for i, ex in enumerate(itemsA))
    matchB = sorted((ex['model'].value + ', '.join(sorted(ex['attributes'].value.split(', '))), i) for i, ex in enumerate(itemsB))
    
    i,j = 0,0
    matches = []
    # warning! this works only if list values are unique (which they are in my case :)
    while i < len(matchA) and j < len(matchB):
        mA, iA = matchA[i]
        mB, iB = matchB[j]
    
        if mA == mB:
            matches.append((iA,iB))
            i += 1
            j += 1 
        elif mA < mB:
            i += 1
        else:
            j += 1
    
    print len(matches), 'matched'
    indA, indB = zip(*matches)
    
    # restrict both matrices to the matched models, in matching order
    matrixA = modelA.take(indA, axis=0).take(indA, axis=1)
    matrixB = modelB.take(indB, axis=0).take(indB, axis=1)
    
    # each row sorted by distance gives a model's neighbor ranking
    nnA = numpy.argsort(matrixA)
    nnB = numpy.argsort(matrixB)

    print 'comparing...'
    scores = []
    for k in range(2, len(nnA)+1):
        nnA_tmp = nnA[:,:k]
        nnB_tmp = nnB[:,:k]
        count = 0
        for i in range(len(nnA)):
            # -1 discounts the model itself (distance 0 to itself puts it
            # first in its own neighbor list)
            count += len(set(nnA_tmp[i]).intersection(set(nnB_tmp[i]))) - 1

        scores.append(count / float((k-1)*len(nnA)))
        if k % 100 == 0:
            print k
    return scores
+
def plt(x, y, fn):
    """Line-plot *y* against *x* (k-neighbors vs. similarity) and write
    the figure to file *fn*.

    The function name shadows the conventional `plt` alias, so the
    matplotlib import below is kept local under a different name.
    """
    import matplotlib.pyplot as pyplot

    pyplot.title('')
    pyplot.xlabel('k-neighbors')
    pyplot.ylabel('similarity')
    pyplot.grid(True)
    pyplot.plot(x, y, linewidth=1.0)
    pyplot.savefig(fn)
+    
scores = compare_model_similarity(modelA, itemsA, modelB, itemsB)

# plot the first 50 neighborhood sizes and the full curve
plt(range(1, len(scores[:50])+1), scores[:50], '%s%s-similarity-50.png' % (ROOT, fileName) )
plt(range(1, len(scores)+1), scores, '%s%s-similarity.png' % (ROOT, fileName) )

print 'saving results...'
# append one semicolon-separated line per comparison run
fp = file(ROOT + 'similarity_results.txt', 'a')
fp.write('%s-class;%s-prob;%s\n' % (fileName, fileName, ';'.join(str(s) for s in scores)))
fp.close()
+import pickle
+import orange
+
data = orange.ExampleTable(r"c:\Python26\Lib\site-packages\orange\doc\datasets\primary-tumor.tab")

##############################################################################
## preprocess Data set
# continuize the domain: multi-valued discrete attributes expand to one
# indicator column per value, continuous attributes are normalized by span,
# and the class variable is left unchanged
transformer = orange.DomainContinuizer()
transformer.multinomialTreatment = orange.DomainContinuizer.NValues
transformer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
transformer.classTreatment = orange.DomainContinuizer.Ignore
domain = transformer(data)

# NOTE(review): unpickling executes arbitrary code -- only load
# classifier.pkl from a trusted source
classifier = pickle.load(open('classifier.pkl', 'rb'))
pickle.dump(classifier, open('tmp.pkl', "wb"))

projection_dss.py

+import orange
+import orngClustering
+import OWDistanceFile
+
root = "c:\\Users\\miha\\Projects\\res\\metamining\\"
in_file = root + "dst\\zoo-projections-500-abs.dst"

# the distance matrix and its accompanying meta-data table
smx, labels, data = OWDistanceFile.readMatrix(in_file)
data = orange.ExampleTable(root + 'tab\\zoo-projections-500.tab')

# normalize to interval [0,1]
smx.normalize(0)
# invert 1 - X
smx.invert(1)

c = orange.HierarchicalClustering(smx, linkage=orange.HierarchicalClustering.Average)
depth = 10            # maximum tree depth findProjections will descend
min_projections = 20  # clusters at or below this size are not split further

clusters = []         # leaf clusters collected by findProjections
def findProjections(cluster, l):
    """Walk the clustering tree and collect final clusters.

    Starting from `cluster` at recursion depth `l`, keep splitting into
    the left/right branches while the node is splittable, the global
    `depth` limit is not exceeded and the node holds more than
    `min_projections` members; every node where the descent stops is
    appended to the global `clusters` list.
    """
    level = l + 1
    # stop at leaves, at the depth limit, or at sufficiently small clusters
    if not (cluster.branches and level <= depth and len(cluster) > min_projections):
        clusters.append(cluster)
        return
    findProjections(cluster.left, level)
    findProjections(cluster.right, level)
+
findProjections(c, 0)

# from each cluster keep the projection with the best (vizrank score,
# attribute count) pair as its representative
# NOTE(review): the inner loop variable `c` shadows the clustering root
# defined above; safe here since the root is not used again
include = []
for cluster in clusters:
    scores = [(data[c]['vizrank'].value, data[c]['number of attributes'].value, c) for c in cluster]
    scores.sort()
    include.append(scores[-1][2])

# distance sub-matrix over the selected representatives only
new_smx = orange.SymMatrix(len(include))
for i in range(new_smx.dim):
    for j in range(i):
        new_smx[i,j] = smx[include[i], include[j]]
new_smx.items = data.getitems(include)
+
def saveSymMatrix(matrix, file):
    """Write *matrix* to '<file>.dst' in the labeled lower-triangular
    distance format and save its item table alongside as '<file>.tab'.
    """
    out = open(file + ".dst", 'w')
    out.write("%d labeled\n" % matrix.dim)

    # one row per item: its label followed by distances to items 0..row
    for row in range(matrix.dim):
        out.write("%s" % matrix.items[row]['label'])
        for col in range(row + 1):
            out.write("\t%.6f" % matrix[row, col])
        out.write("\n")

    out.close()
    matrix.items.save(file + ".tab")
+
+saveSymMatrix(new_smx, root + 'projections-dss')

projections2matrix.py

+import time
+import math
+import uuid
+import pickle
+
+import numpy
+import scipy.stats
+
+import orange
+import orngClustering
+import orngVizRank as vr
+import orngStat
+import orngTest
+
+from tools import *
+          
def calculateProjections(data, method, projectionLimit=10, attributes=None):
    """initialize VizRank and evaluate projections

    Configure a VizRank instance for the given visualization `method`
    and run the projection search, either over the supplied `attributes`
    or over all deterministic attribute subsets.  Returns the VizRank
    instance with its evaluation results stored.
    """

    print "%s: calculating projections" % MODEL_LIST[method]
    vizr = vr.VizRank(method)
    # only linear projections get anchor-position optimization
    if method == vr.LINEAR_PROJECTION:
        vizr.projOptimizationMethod = 1
    else:
        vizr.projOptimizationMethod = 0
    vizr.setData(data)
    vizr.projectionLimit = projectionLimit
    vizr.attributeCount =  9
    vizr.storeEachPermutation = 1
    vizr.optimizationType = vr.MAXIMUM_NUMBER_OF_ATTRS
    vizr.saveEvaluationResults = 1
    # no attribute pre-ranking: consider all attributes equally
    vizr.attrCont = vr.CONT_MEAS_NONE
    #vizr.attrCont = vr.CONT_MEAS_S2NMIX
    vizr.attrDisc = vr.DISC_MEAS_NONE
    vizr.attrSubsetSelection = vr.DETERMINISTIC_ALL

    # NOTE(review): evaluateProjections(vizr, attributes) is presumably a
    # helper imported via tools -- confirm it is in scope
    if attributes:
        evaluateProjections(vizr, attributes)
    else:
        vizr.evaluateProjections()

    return vizr
+
+def processProjections(data, vizr_results, projection_points=[], method=None, scaleProjData=None):
+    """calculate projection distance matrices"""
+    
+    if projection_points == [] and scaleProjData == None:
+        print "Error: either projection_points or scaleProjData must be given."
+        return
+    
+    if projection_points != [] and scaleProjData != None:
+        print "Warning: projection_points and scaleProjData both given. scaleProjData will be ignored."
+        scaleProjData = None
+    
+    out = getModelsExampleTable()
+    
+    #preprocess projections
+    attributeset = set()
+    todelete = []
+    print "constructing example table"
+    #projection_distance_matrices = []
+    #projection_distances = []
+    projection_classprobs = []
+    projection_predictprobs = []
+    
+    counter = 0
+    time_start = time.time()
+    for ndx, r in enumerate(vizr_results):
+        lbl = ', '.join(sorted(r[3]))