Source

orange-modelmaps / archive / forest2matrix.py

import time
import uuid
import pickle
import numpy

import orange
import orngTree
import orngEnsemble
import orngTest
import orngStat

from tools import *

# ---------------------------------------------------------------------------
# Script configuration and model construction.
#
# Loads a data table, learns a random forest of TREE_LIMIT trees on it,
# evaluates every tree of the forest separately on that same table, and
# creates the (initially empty) table that will hold one row per tree.
# ---------------------------------------------------------------------------

# Base directory; both the input table and all output files live below it.
root = "C:\\Users\\miha\\Projects\\res\\metamining\\"
#root = "/home/miha/metamining/"
# Output path prefix, appended to root when the result files are written.
out_file = 'dst/primary'
# Index into MODEL_LIST used to label every model row
# (MODEL_LIST is not defined in this file -- presumably from `tools`; verify).
method = 5
data = orange.ExampleTable(root + 'tab/primary-c.tab')
# Number of trees requested from the random forest learner.
TREE_LIMIT = 15

# Base learner handed to the forest: a tree learner instance whose settings
# are adjusted below before it is passed on.
tree = orngTree.TreeLearner(storeNodeClassifier = 0, storeContingencies=0, \
  storeDistributions=1, minExamples=5, storeExamples=1).instance()
gini = orange.MeasureAttribute_gini()
# Use the Gini measure in both the discrete and the continuous split
# constructor. This has to happen before tree.split is replaced below.
tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
tree.maxDepth = 4
# Wrap the original split constructor; the second argument (3) is presumably
# the number of attributes considered at each split -- confirm against
# orngEnsemble.SplitConstructor_AttributeSubset.
tree.split = orngEnsemble.SplitConstructor_AttributeSubset(tree.split, 3)

forestLearner = orngEnsemble.RandomForestLearner(learner=tree, trees=TREE_LIMIT)
forest = forestLearner(data)

# One test result per tree: each classifier of the forest is tested on its
# own, on the same table the forest was learned from.
results = [orngTest.testOnData([c], data) for c in forest.classifiers]

# Table collecting one metadata row per model; rows are appended further down
# (getModelsExampleTable is not defined here -- presumably from `tools`).
out = getModelsExampleTable()

def getAttributes(node):
    """Collect the names of the variables used for splitting in a subtree.

    Walks the tree rooted at ``node`` in pre-order. For every node that has
    a branch selector, the name of the selector's ``classVar`` is recorded,
    followed by the names found in each non-empty branch, in branch order.

    Returns a list of names. A name occurs once for every node that splits
    on it, so duplicates are possible. A node without a branch selector
    yields an empty list.
    """
    selector = node.branchSelector
    if not selector:
        # No selector: nothing is split here and branches are not visited.
        return []

    names = [selector.classVar.name]
    for child in node.branches:
        # Empty branch slots are skipped.
        if child:
            names.extend(getAttributes(child))
    return names

# Per-model prediction data used later for the distance matrices:
#   model_classprobs[k]   -- array with, for every tested example, the
#                            probability model k assigned to its actual class
#   model_predictprobs[k] -- list with, for every tested example, the full
#                            probability array predicted by model k
model_classprobs = []
model_predictprobs = []
for model_result, classifier in zip(results, forest.classifiers):
    tested = model_result.results
    model_classprobs.append(
        numpy.array([t.probabilities[0][t.actualClass] for t in tested]))
    model_predictprobs.append(
        [numpy.array(t.probabilities[0]) for t in tested])

    # Distinct names of the variables this tree splits on.
    used_attributes = list(set(getAttributes(classifier.tree)))

    # Describe the model as one row of the output table.
    row = orange.Example(out.domain)
    row['uuid'] = uuid.uuid4().hex
    row['model'] = MODEL_LIST[method]
    row['attributes'] = ", ".join(used_attributes)
    row['number of attributes'] = len(used_attributes)
    row['score'] = orngStat.CA(model_result)[0]
    out.append(row)
    
##########################################################################
## calculate projection distance matrices
print 'calculating projection distance matrices,', len(model_classprobs), 'models'
dim = len(model_classprobs)
smx_class = orange.SymMatrix(dim)
smx_probs = orange.SymMatrix(dim)

counter = 0
time_start = time.time()

for i in range(dim):
    for j in range(i+1, dim):
        smx_class[i,j] = numpy.sum(numpy.power(model_classprobs[i] - model_classprobs[j], 2))
        # sum(sum_i(pi_1^i - pi_2^i)^2) - predictions probability squared error
        smx_probs[i,j] = sum([numpy.sum(numpy.power(p1 - p2, 2)) for (p1, p2) in zip(model_predictprobs[i],model_predictprobs[j])])
                                     
        counter += 1
        if counter % 500 == 0:
            time_elapsed = time.time() - time_start
            time_total = time_elapsed / counter * dim * (dim - 1) / 2
            time_remainng = int(time_total - time_elapsed)
            print counter, '/', dim * (dim - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
    
# Write all results. Every output file shares the same path prefix:
#   <root><out_file>-tree-<number of models>
out_prefix = root + out_file + "-" + "tree" + '-' + str(dim)

# Distance matrices (saveSymMatrix is not defined in this file -- presumably
# provided by `tools`), followed by the table of model descriptions.
saveSymMatrix(smx_class, out_prefix + '-class', out)
saveSymMatrix(smx_probs, out_prefix + '-prob', out)
out.save(out_prefix + '.tab')

# Pickle the method index, the model uuids (in table order), the per-model
# test results, a None placeholder and the classifiers themselves.
# try/finally guarantees the file handle is closed even if pickling fails.
output = open(out_prefix + '.res', 'wb')
try:
    pickle.dump((method, [ex["uuid"].value for ex in out], results, None, forest.classifiers), output)
finally:
    output.close()