# Source: orange-modelmaps / archive / metamining.py-bad (full commit)
import os
import os.path
import gc
import pickle
import random

import orngNetwork
import OWDistanceFile

from tools import *
from projections2matrix import *
from matrix2network import *
from classifier2matrix import *

# ---------------------------------------------------------------------------
# Experiment configuration.
# LIMIT caps both the number of projections evaluated per visualization
# method and the number of trees grown for the random forest below.
# ---------------------------------------------------------------------------
LIMIT = 1000
PROJECTION_LIMIT = LIMIT
TREE_LIMIT = LIMIT
TREE_FILE = "breast-tree"
#TREE_FILE = "zoo-tree"

#ROOT_DATA = "C:\\Python26\\Lib\\site-packages\\orange\\doc\\datasets\\"
#ROOT = "C:\\Users\\miha\\Projects\\res\\metamining\\"
#ROOT_DATA = "/home/miha/metamining/"
#ROOT = "/home/miha/metamining/"
#OUT_FILE = ROOT + "dst/breast"
#OUT_FILE = ROOT + "dst/zoo"

#data = getData(ROOT_DATA + "primary-tumor.tab", 1)
#data_c = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
#data_d = getData(ROOT + "tab/breast-cancer-wisconsin-c.tab")
# NOTE(review): ROOT (and OUT_FILE, used further down) are only assigned in
# the commented-out lines above -- running this file as-is raises NameError
# unless they are defined elsewhere before this script runs; confirm.
data_c = getData(ROOT + "tab/zoo-c.tab")  # continuized data (projections)
data_d = getData(ROOT + "tab/zoo.tab")    # discrete data (classifiers)
#
# NOTE: `input` shadows the builtin of the same name, and this value is dead
# -- it is unconditionally re-assigned in the "ZOO: READ LIMIT 100" section.
input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
         (vr.RADVIZ           , "radviz"     , 0.00, 1),
         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 

#input_classifier = [("knn"  , 0.00, 1)]
#                    ("bayes", 0.00, 1),
#                    ("tree" , 0.00, 1)]
##############################################################################
# BREAST: READ LIMIT 100
##############################################################################
#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
#         (ROOT + "dst/breast-linproj-101"   , 0.00, 2),
#         (ROOT + "dst/breast-radviz-101"    , 0.00, 2),
#         (ROOT + "dst/breast-polyviz-101"   , 0.00, 2)] 
#
#input_classifier = [(ROOT + "dst/breast-tree-110" , 0.00, 1),
#                    (ROOT + "dst/breast-knn-110"  , 0.00, 2),
#                    (ROOT + "dst/breast-bayes-110", 0.00, 2)]
#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
##############################################################################
# BREAST: READ LIMIT 500
##############################################################################
#input = [(ROOT + "dst/breast-scatterplot-36", 0.00, 0),
#         (ROOT + "dst/breast-linproj-501"   , 0.00, 2),
#         (ROOT + "dst/breast-radviz-501"    , 0.00, 2),
#         (ROOT + "dst/breast-polyviz-501"   , 0.00, 2)] 
#
#input_classifier = [(ROOT + "dst/breast-tree-500" , 0.00, 1),
#                    (ROOT + "dst/breast-knn-510"  , 0.00, 2),
#                    (ROOT + "dst/breast-bayes-510", 0.00, 2),
#                    (ROOT + "dst/breast-svms-510" , 0.00, 4)]
##############################################################################
# ZOO: READ LIMIT 100
##############################################################################
# Active configuration: compute all four projection families from scratch
# (4-tuples -> computed; 3-tuples of paths -> loaded; see getProjections),
# plus tree/bayes/knn classifiers (see getClassifiers).
input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 1),
         (vr.RADVIZ           , "radviz"     , 0.00, 1),
         (vr.POLYVIZ          , "polyviz"    , 0.00, 1)] 

input_classifier = [("tree" , 0.00, 1),
                    ("bayes", 0.00, 1),
                    ("knn"  , 0.00, 1)]

#input = [("dst/zoo-scatterplot-120", 0.00, 1),
#         ("dst/zoo-linproj-93"     , 0.00, 1),
#         ("dst/zoo-radviz-100"     , 0.00, 1),
#         ("dst/zoo-polyviz-100"    , 0.00, 1)] 
#
#input_classifier = [("dst/zoo-tree-100" , 0.00, 1),
#                    ("dst/zoo-bayes-192", 0.00, 1),
#                    ("dst/zoo-knn-192"  , 0.00, 1)]
##############################################################################
# ZOO: READ LIMIT 500
##############################################################################
#input = [(vr.SCATTERPLOT      , "scatterplot", 0.00, 1),
#         (vr.LINEAR_PROJECTION, "linproj"    , 0.00, 2),
#         (vr.RADVIZ           , "radviz"     , 0.00, 4),
#         (vr.POLYVIZ          , "polyviz"    , 0.00, 4)] 
#input_classifier = [("tree" , 0.00, 2),
#                    ("bayes", 0.00, 4),
#                    ("knn"  , 0.00, 2)]
#input = [("dst/zoo-scatterplot-120", 0.00, 1),
#         ("dst/zoo-linproj-453"    , 0.00, 2),
#         ("dst/zoo-radviz-500"     , 0.00, 4),
#         ("dst/zoo-polyviz-500"    , 0.00, 4)] 
#input_classifier = [("dst/zoo-tree-500" , 0.00, 2),
#                    ("dst/zoo-bayes-573", 0.00, 4),
#                    ("dst/zoo-knn-573"  , 0.00, 2)]
##############################################################################
# PRIMARY: READ LIMIT 500
##############################################################################
#input = [(ROOT + "dst/primary-scatterplot-153", 0.00, 1),
#         (ROOT + "dst/primary-linproj-377"    , 0.00, 2),
#         (ROOT + "dst/primary-radviz-500"     , 0.00, 4),
#         (ROOT + "dst/primary-polyviz-500"    , 0.00, 4)]
#input_classifier = [("dst/primary-tree-500" , 0.00, 2),
#                    ("bayes", 0.00, 2),
#                    ("knn"  , 0.00, 2)]
##############################################################################

def getProjections(input, attributes=None):
    vizrs = []
    smxs = []
    nets = []    
    projs_points = []
    methods = []
    uuid_results = {}
    for inp in input:
        if type(inp[0]) == type(""):
            fn, ratio, knn = inp
            print "READING: %s" % fn            
            pkl_file = open(fn + ".res", "rb")
            res = pickle.load(pkl_file)
            pkl_file.close()
            matrix, labels, data = OWDistanceFile.readMatrix(fn + ".dst")
            matrix.items = orange.ExampleTable(fn + ".tab")
            net = matrix2network(matrix, ratio, knn)
            net.items = matrix.items
            vizrs.append([res[ex['uuid'].value][1] for ex in net.items])
            smxs.append(matrix)
            nets.append(net)
            projs_points.append([res[ex['uuid'].value][2] for ex in net.items])
            methods.append([res[ex['uuid'].value][0] for ex in net.items])
            uuid_results.update(res)
                
        else:
            method, name, ratio, knn = inp
            vizr, smx_knn_predict, proj_points = metamining(data_c, method, name, PROJECTION_LIMIT, OUT_FILE, attributes)
            net = matrix2network(smx_knn_predict, ratio, knn)
            net.items = smx_knn_predict.items
            vizrs.append(vizr.results)
            smxs.append(smx_knn_predict)
            nets.append(net)
            projs_points.append(proj_points)
            methods.append(method)
            attrs = [ex['attributes'].value.split(', ') for ex in net.items]
            uuid_results.update(dict(zip([ex["uuid"].value for ex in net.items], \
                                zip([method for ex in net.items], [r for r in vizr.results], \
                                    proj_points, [None for ex in net.items], attrs))))
          
    return vizrs, nets, smxs, projs_points, methods, uuid_results

def getClassifiers(input, attributes=[]):
    results = {}
    smxs = []
    nets = []
    for s, ratio, knn in input:
        classifiers = None
        if s.upper() == "TREE":
            print "TREE: calculating classifiers"
            method = 5
            tree = orngTree.TreeLearner(storeNodeClassifier = 1, storeContingencies=0, \
              storeDistributions=1, minExamples=5, storeExamples=1).instance()
            gini = orange.MeasureAttribute_gini()
            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
            tree.maxDepth = 4
            tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)
            forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
            forest = forestLearner(data_d)
            classifiers = forest.classifiers
            outFile = OUT_FILE + "-tree"
            
        if s.upper() == "BAYES":
            if len(attributes) == 0: continue
            print "BAYES: calculating classifiers"
            method = 6
            
            classifiers = []
            for atts in attributes:
                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
                classifiers.append(orange.BayesLearner(data2))
            outFile = OUT_FILE + "-bayes"
            
        if s.upper() == "KNN":
            if len(attributes) == 0: continue
            print "kNN: calculating classifiers"
            method = 7
            
            classifiers = []
            for atts in attributes:
                exclude = [att for att in data_d.domain if att.name not in atts + [data_d.domain.classVar.name]]
                data2 = orange.Preprocessor_ignore(data_d, attributes = exclude)
                classifiers.append(orange.kNNLearner(data2, k=math.sqrt(len(data2))|1))
            outFile = OUT_FILE + "-knn"
            
        if classifiers:
            smx, RV = classifier2matrix(data_d, method, classifiers, outFile)
            methods, uuids, res, projections, classifiers, attrs = RV
            net = matrix2network(smx, ratio, knn)
            net.items = smx.items
            
            results.update(dict(zip(uuids, zip(methods, res, projections, classifiers, attrs))))
            smxs.append(smx)
            nets.append(net)

        if os.path.exists("%s.res" % s):
            print "READING: %s" % s
            res = pickle.load(open("%s.res" % s, 'rb'))
            smx, labels, data = OWDistanceFile.readMatrix("%s.dst" % s)
            smx.items = orange.ExampleTable("%s.tab" % s)
            net = matrix2network(smx, ratio, knn)
            net.items = smx.items
            
            results.update(res)
            smxs.append(smx)
            nets.append(net)
            
    return nets, smxs, results
    
def getClusters(nets, smxs):
    """Cluster every model network and collect the per-network summaries.

    Runs ``cluster2matrix`` on each paired (network, similarity matrix) and
    returns four parallel lists: median matrices, median model indices,
    cluster sizes, and best-model indices.
    """
    summaries = [cluster2matrix(net, smx) for net, smx in zip(nets, smxs)]
    median_matrices = [summary[0] for summary in summaries]
    median_indices = [summary[1] for summary in summaries]
    median_csizes = [summary[2] for summary in summaries]
    best_indices = [summary[3] for summary in summaries]

    return median_matrices, median_indices, median_csizes, best_indices

# Output ExampleTable that will collect one meta-data row per model.
out = getModelsExampleTable()

##############################################################################
# GET PROJECTIONS
##############################################################################
##############################################################################
# GENERATE ATTRIBUTES FROM PROJECTIONS
#attrs = [u.split(';') for u in list(set([';'.join(sorted(v[4])) for v in res.values()]))]

# NOTE(review): the next two assignments are dead code -- `attributes` is
# unconditionally overwritten three lines below; confirm before removing.
attributes = getRandomAttributeSubsets(data_d.domain, LIMIT)
attributes = attributes + [[var.name for var in data_d.domain if var != data_d.domain.classVar]]

# Attribute subsets are instead taken from a previously saved model table:
# every non-scatterplot model's attribute list, deduplicated, then split
# back into lists of attribute names.
attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-436.tab') if ex['model'].value != 'SCATTERPLOT']
attributes = set(attributes)
attributes = [attr.split(', ') for attr in attributes]

vizrs, nets, smxs, projs_points, methods, res = getProjections(input, attributes)

# Report the dimension of each model-distance matrix.
for smx in smxs:
    print smx.dim
    
# Merge the per-method projection results and append one row per model
# to the output table `out`.
merged_vizr_res = []
merged_proj_points = []
probabilities = []
results = []

for i, vizr in enumerate(vizrs):
    merged_vizr_res.extend(vizrs[i])
    merged_proj_points.extend(projs_points[i])
    # v[5] is the per-projection dict; "Results" holds evaluation results.
    results.extend([v[5].get("Results").results for v in vizrs[i]])
    # NOTE(review): one Example instance is reused and mutated in the inner
    # loop; presumably orange.ExampleTable.append copies it so each row stays
    # distinct -- confirm against the Orange version in use.
    ex = orange.Example(out.domain)
    for m in range(len(vizr)):
        ex["uuid"] = nets[i].items[m]["uuid"].value
        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
        ex["CA"] = nets[i].items[m]["CA"].value
        #ex["AUC"] = nets[i].items[m]["AUC"].value
        #ex["CA by class"] = nets[i].items[m]["CA by class"].value
        #ex["cluster CA"] = best_indices[i][j]
        ex["attributes"] = nets[i].items[m]["attributes"].value
        ex["model"] = nets[i].items[m]["model"].value
        #ex["cluster size"] = median_csizes[i][j]
        out.append(ex)

# All models are kept; the commented block below is the alternative path
# that keeps only cluster-median models (via getClusters).
uuid_results = res
#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
#print "projection results:", len(res)
#merged_vizr_res = []
#merged_proj_points = []
#probabilities = []
#results = []
#models = []
#for i, medians in enumerate(median_indices):    
#    for j, m in enumerate(medians):
#        vizrs[i][m][5]['Method'] = methods[i]
#        models.append(methods[i])
#        ex = orange.Example(out.domain)
#        ex["uuid"] = nets[i].items[m]["uuid"].value
#        ex["number of attributes"] = nets[i].items[m]["number of attributes"].value
#        ex["CA"] = nets[i].items[m]["CA"].value
#        ex["AUC"] = nets[i].items[m]["AUC"].value
#        ex["CA by class"] = nets[i].items[m]["CA by class"].value
#        ex["cluster CA"] = best_indices[i][j]
#        ex["attributes"] = nets[i].items[m]["attributes"].value
#        ex["model"] = nets[i].items[m]["model"].value
#        ex["cluster size"] = median_csizes[i][j]
#        out.append(ex)
#        
#    merged_vizr_res.extend([vizrs[i][m] for m in medians])
#    merged_proj_points.extend([projs_points[i][m] for m in medians])
#    results.extend([vizrs[i][m][5].get("Results").results for m in medians])
##############################################################################
# GET CLASSIFIERS
##############################################################################
#attributes = attributes + [[var.name] for var in data_d.domain if var != data_d.domain.classVar]
#
#nets, smxs, uuid_results = getClassifiers(input_classifier, attributes)
#median_matrices, median_indices, median_csizes, best_indices = getClusters(nets, smxs)
#
#print "process classifiers"
#uuid_results.update(res)
#print "merged results:", len(uuid_results)
#for i in range(len(nets)):
#    items = nets[i].items
#    median_matrix = median_matrices[i]
#    medians = median_indices[i]
#    for j, m in enumerate(medians):
#        out_ex = orange.Example(out.domain)
#        out_ex["uuid"] = items[m]["uuid"].value
#        out_ex["number of attributes"] = items[m]["number of attributes"].value
#        out_ex["CA"] = items[m]["CA"].value
#        out_ex["AUC"] = items[m]["AUC"].value
#        out_ex["CA by class"] = items[m]["CA by class"].value
#        out_ex["cluster CA"] = best_indices[i][j]
#        out_ex["attributes"] = items[m]["attributes"].value
#        out_ex["model"] = items[m]["model"].value
#        out_ex["cluster size"] = median_csizes[i][j]
#        out.append(out_ex)
#        
#    results.extend([uuid_results[items[m]["uuid"].value][1] for m in medians])
#    models.extend([items[m]["model"].value for m in medians])
#
# Build the model-distance matrices (class-prediction and probability based)
# from the merged per-model evaluation results, then save everything:
# the probability matrix, the meta-data table, and the result dicts.
smx_class, smx_prob = models2matrix(results)

print "saving", '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim)
saveSymMatrix(smx_prob, '%s-allmodels-%d' % (OUT_FILE, smx_prob.dim), out)
out.save('%s-allmodels-%d.tab' % (OUT_FILE, smx_prob.dim))
# Keep only the result entries for models that actually made it into `out`;
# any uuid missing from uuid_results is printed for diagnosis.
nr = {}
for ex in out:
    uuid = ex['uuid'].value 
    if uuid in uuid_results:
        nr[uuid] = uuid_results[uuid]
    else:
        print uuid
# NOTE(review): these file objects are never explicitly closed (CPython
# closes them when the refcount drops) -- consider try/finally if it matters.
pickle.dump(nr, open('%s-allmodels-%d.res' % (OUT_FILE, smx_prob.dim), "wb"))
pickle.dump(uuid_results, open('%s-allmodels-%d-all.res' % (OUT_FILE, smx_prob.dim), "wb"))