Source

orange-modelmaps / archive / tools.py

Full commit
import math
import os.path
import pickle
import random
import time

import numpy
import orange

from orngScaleData import getVariableValuesSorted
from OWDistanceFile import readMatrix

# Names of the known model types. Index 0 is an empty-string placeholder;
# getModelsExampleTable() uses MODEL_LIST[1:] as the values of its "model" column.
MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]

# Hard-coded, machine-specific base directory that OUT_FILE is built from.
ROOT = "/home/miha/work/res/metamining/"
# Output path prefix under ROOT. The commented-out assignments below are a
# disabled alternative prefix. NOTE(review): OUT_FILE is not referenced by any
# function visible in this part of the file - confirm where it is consumed.
#OUT_FILE = ROOT + "dst/zoo"
#OUT_FILE = ROOT + "dst/zoo"
OUT_FILE = ROOT + "_astra_/fprdk"

def saveSymMatrix(matrix, file, items=None, saveItems=False):
    """Write the lower triangle of a symmetric matrix to ``file + ".dst"``.

    The output starts with a ``"<dim> labeled"`` header line.  Each following
    row holds the label of item ``i`` (its ``'attributes'`` value) followed by
    the tab-separated values ``matrix[i, 0] .. matrix[i, i]`` formatted with
    six decimals.

    matrix    -- object exposing ``dim``, ``matrix[i, j]`` indexing and, when
                 ``items`` is not given, an ``items`` attribute
    file      -- output path without extension
    items     -- optional row items; falls back to ``matrix.items`` when
                 missing or empty
    saveItems -- when true, also call ``items.save(file + ".tab")``
    """
    # Resolve the labels before touching the file system so a matrix without
    # items does not leave a truncated .dst file behind.
    items = items if items else matrix.items

    # ``with`` closes the handle even when a write or an item lookup raises.
    with open(file + ".dst", 'w') as fn:
        fn.write("%d labeled\n" % matrix.dim)
        for i in range(matrix.dim):
            cells = ["%s" % items[i]['attributes']]
            cells.extend("\t%.6f" % matrix[i, j] for j in range(i + 1))
            cells.append("\n")
            fn.write("".join(cells))

    if saveItems:
        items.save(file + ".tab")

def getModelsExampleTable():
    """Return a new orange.ExampleTable whose domain holds the model columns.

    Columns, in order: uuid, number of attributes, CA, AUC, CA by class,
    cluster CA, label, attributes, model, cluster size.  The "model" column
    is an enum over MODEL_LIST[1:].
    """
    def zeroDecimalFloat(name):
        # float column with numberOfDecimals set to 0
        var = orange.FloatVariable(name)
        var.numberOfDecimals = 0
        return var

    columns = [
        orange.StringVariable("uuid"),
        zeroDecimalFloat("number of attributes"),
        orange.FloatVariable("CA"),
        orange.FloatVariable("AUC"),
        orange.StringVariable("CA by class"),
        orange.FloatVariable("cluster CA"),
        orange.StringVariable("label"),
        orange.StringVariable("attributes"),
        orange.EnumVariable("model", values=MODEL_LIST[1:]),
        zeroDecimalFloat("cluster size"),
    ]
    # NOTE(review): the second Domain argument (0) presumably means "no class
    # variable" - confirm against the Orange version in use.
    return orange.ExampleTable(orange.Domain(columns, 0))

def loadModel(fn):
    """Load a distance matrix plus its optional side files.

    ``fn`` is a path prefix.  The matrix is read with readMatrix() from the
    first existing file among ``fn + ".npy"``, ``fn + "-prob.dst"`` and
    ``fn + ".dst"``.  When none of them exists, None is returned.

    If ``fn + ".tab"`` exists it is loaded as an orange.ExampleTable and
    stored in ``matrix.items``; if ``fn + ".res"`` exists it is unpickled
    into ``matrix.results``.  A missing side file is only reported on stdout.
    """
    # Candidate matrix files in order of preference.
    for suffix in (".npy", "-prob.dst", ".dst"):
        matrix_path = "%s%s" % (fn, suffix)
        if os.path.exists(matrix_path):
            matrix, _labels, _data = readMatrix(matrix_path)
            break
    else:
        return None

    tab_path = "%s.tab" % fn
    if os.path.exists(tab_path):
        matrix.items = orange.ExampleTable(tab_path)
    else:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("ExampleTable %s not found!\n" % tab_path)

    res_path = "%s.res" % fn
    if os.path.exists(res_path):
        # SECURITY: unpickling executes arbitrary code from the file; only
        # load .res files that come from a trusted source.
        with open(res_path, 'rb') as res_file:
            matrix.results = pickle.load(res_file)
    else:
        print("Results pickle %s not found!\n" % res_path)

    return matrix

def saveModel(smx, fn):
    """Persist a model matrix under the path prefix ``fn``.

    Writes three files: ``fn + ".dst"`` (via saveSymMatrix), ``fn + ".tab"``
    (``smx.items.save``) and ``fn + ".res"`` (pickle of ``smx.results``).
    """
    saveSymMatrix(smx, "%s" % fn, smx.items)
    smx.items.save('%s.tab' % fn)
    # ``with`` makes sure the pickle is flushed and the handle closed, even
    # if pickling fails part-way through.
    with open('%s.res' % fn, "wb") as res_file:
        pickle.dump(smx.results, res_file)


def getData(dataFile, continuize=0):
    """Load ``dataFile`` as an orange.ExampleTable, optionally preprocessed.

    With a false ``continuize`` the table is returned as loaded.  Otherwise
    the table is translated through an orange.DomainContinuizer (NValues for
    multinomial attributes, NormalizeBySpan for continuous ones, class
    ignored) and then passed through an average-value imputer.
    """
    table = orange.ExampleTable(dataFile)
    if not continuize:
        return table

    # configure the domain continuizer
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = orange.DomainContinuizer.NValues
    continuizer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
    continuizer.classTreatment = orange.DomainContinuizer.Ignore
    continuized = table.translate(continuizer(table))

    # Impute with averages; dropping examples with missing values was tried
    # before and left too few examples.
    imputer = orange.ImputerConstructor_average(continuized)
    return imputer(continuized)
    
def evaluateProjections(vizr, attributeList):
    """Evaluate one projection per attribute subset and record the results.

    vizr          -- projection evaluator; its counters, evaluation data,
                     results and arguments are reset before the run
    attributeList -- iterable of attribute-name sequences; every name must be
                     a key of ``vizr.graph.attributeNameIndex``

    For each subset a projection table is built - through
    ``vizr.freeviz.findProjection`` when ``vizr.projOptimizationMethod`` is
    non-zero, otherwise from the anchors produced by
    ``createXAnchors``/``createYAnchors`` - evaluated with
    ``vizr.evaluateProjection`` and stored with ``vizr.addResult``.  Subsets
    that give no projection, or fewer than ``vizr.minNumOfExamples`` examples,
    are skipped.

    Returns the number of projections that were evaluated.
    """
    vizr.evaluatedProjectionsCount = 0
    vizr.optimizedProjectionsCount = 0
    vizr.evaluationData = {}            # clear all previous data about tested permutations and stuff
    vizr.evaluationData["triedCombinations"] = {}
    vizr.clearResults()

    vizr.clearArguments()

    if vizr.projOptimizationMethod != 0:
        vizr.freeviz.useGeneralizedEigenvectors = 1
        vizr.graph.normalizeExamples = 0

    # domain shared by all projection tables: x, y and the class variable
    classVar = vizr.graph.dataDomain.classVar
    domain = orange.Domain([orange.FloatVariable("xVar"),
                            orange.FloatVariable("yVar"),
                            orange.EnumVariable(classVar.name, values = getVariableValuesSorted(classVar))])
    classListFull = vizr.graph.originalData[vizr.graph.dataClassIndex]

    for attributes in attributeList:
        attrIndices = [vizr.graph.attributeNameIndex[attr] for attr in attributes]
        if vizr.projOptimizationMethod != 0:
            projections = vizr.freeviz.findProjection(vizr.projOptimizationMethod, attrIndices, setAnchors = 0, percentDataUsed = vizr.percentDataUsed)
            # No projection for this subset: skip it.  Falling through would
            # hit unbound names on the first iteration, or silently reuse the
            # table and anchors of the previous subset on later ones.
            if projections is None:
                continue
            xanchors, yanchors, (attrNames, newIndices) = projections
            table = vizr.graph.createProjectionAsExampleTable(newIndices, domain = domain, XAnchors = xanchors, YAnchors = yanchors)

            if table is None or len(table) < vizr.minNumOfExamples:
                continue
            accuracy, other_results = vizr.evaluateProjection(table)
            generalDict = {"XAnchors": list(xanchors), "YAnchors": list(yanchors)}
            if vizr.saveEvaluationResults:
                generalDict["Results"] = vizr.evaluationResults
            vizr.addResult(accuracy, other_results, len(table), attrNames, vizr.evaluatedProjectionsCount, generalDict = generalDict)
            vizr.evaluatedProjectionsCount += 1
        else:
            XAnchors = vizr.graph.createXAnchors(len(attrIndices))
            YAnchors = vizr.graph.createYAnchors(len(attrIndices))
            validData = vizr.graph.getValidList(attrIndices)
            if numpy.sum(validData) >= vizr.minNumOfExamples:
                # keep only the examples flagged in validData
                classList = numpy.compress(validData, classListFull)
                selectedData = numpy.compress(validData, numpy.take(vizr.graph.noJitteringScaledData, attrIndices, axis = 0), axis = 1)
                sum_i = vizr.graph._getSum_i(selectedData)

                table = vizr.graph.createProjectionAsExampleTable(attrIndices, validData = validData, classList = classList, sum_i = sum_i, XAnchors = XAnchors, YAnchors = YAnchors, domain = domain)
                accuracy, other_results = vizr.evaluateProjection(table)
                generalDict = {"Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {}
                vizr.addResult(accuracy, other_results, len(table), [vizr.graph.attributeNames[i] for i in attrIndices], vizr.evaluatedProjectionsCount, generalDict)
                vizr.evaluatedProjectionsCount += 1

    return vizr.evaluatedProjectionsCount
    
    
##############################################################################
# GENERATE RANDOM ATTRIBUTES
def getRandomAttributeSubsets(domain, nAttributes):
    """Return ``nAttributes`` distinct random subsets of attribute names.

    domain      -- iterable of variables that have a ``name``; the variable
                   equal to ``domain.classVar`` is excluded
    nAttributes -- number of distinct subsets to generate

    Every subset is a sorted list of names.  The sampler only produces
    subsets with at least 2 and at most ``n - 1`` of the ``n`` non-class
    attributes, so at most ``2**n - n - 2`` distinct subsets exist.

    Raises ValueError when more subsets are requested than can exist.
    Previously that case looped forever, or failed inside random.randint
    when fewer than 3 attributes were available.
    """
    def binomial(n, k):
        # floor division keeps the count an exact integer
        if n > k:
            return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))
        elif n == k:
            return 1
        else:
            return 0

    # single-argument print(...) gives the same output on Python 2 and 3
    print("nAttributes %s" % (nAttributes,))
    attrs = [var.name for var in domain if var != domain.classVar]
    nattrs = len(attrs)
    # number of non-empty subsets: C(n,1) + ... + C(n,n)
    total = sum(binomial(nattrs, size) for size in range(1, nattrs + 1))

    # the sampler never yields single attributes (n of them) or the full set
    available = max(0, total - nattrs - 1)
    if nAttributes > available:
        raise ValueError("cannot draw %s distinct attribute subsets: only %d "
                         "are possible with %d attributes"
                         % (nAttributes, available, nattrs))

    def getSelection(attrs, total):
        pool = list(attrs)
        nattrs = len(pool)
        # ``select`` fixes the subset size: sizes are weighted by the number
        # of subsets of that size; the bounds exclude size 1 and size n
        select = random.randint(nattrs, total - 2)
        cumulative = 0
        attr_selection = []
        for i in range(nattrs):
            index = random.randint(0, len(pool) - 1)
            attr_selection.append(pool.pop(index))
            cumulative += binomial(nattrs, i + 1)
            if select < cumulative:
                break

        return attr_selection

    seen = set()
    attrs_list = []
    while len(attrs_list) < nAttributes:
        candidate = sorted(getSelection(attrs, total))
        key = ';'.join(candidate)
        if key not in seen:
            seen.add(key)
            attrs_list.append(candidate)

    return attrs_list

def models2matrix(results):
    dim = len(results)
    print "%d models to matrix" % dim
    
    model_classprobs = []
    model_predictprobs = []

    for result in results:
        model_classprobs.append(numpy.array([res.probabilities[0][res.actualClass] for res in result]))
        model_predictprobs.append([numpy.array(res.probabilities[0]) for res in result])
    
    smx_class = orange.SymMatrix(dim)
    smx_prob = orange.SymMatrix(dim)
    
    counter = 0
    time_start = time.time()
    for i in range(dim):
        for j in range(i+1, dim):
            smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
            # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
            smx_prob[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
                                         
            counter += 1
            if counter % 5000 == 0:
                time_elapsed = time.time() - time_start
                time_total = time_elapsed / counter * dim * (dim - 1) / 2
                time_remainng = int(time_total - time_elapsed)
                print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
    
    return smx_class, smx_prob