Source

orange-modelmaps / archive / build.py

import uuid
import pickle
import itertools
import scipy.stats

import Orange
import orngVizRank as vr

from tools import *
from operator import itemgetter

# Number of cross-validation folds used by the model builders below.
FOLDS = 10
# Number of trees requested from the random forest in build_rf_models; the
# commented-out driver at the end of this file also passes it to
# getRandomAttributeSubsets as the number of attribute subsets.
MODEL_LIMIT = 500

#data_c = getData(ROOT + "tab/zoo-c.tab")
# Data set loaded at import time. getData and ROOT are not defined in this
# file (presumably provided by `from tools import *` -- TODO confirm).
data_d = getData(ROOT + "tab/zoo.tab")

def build_model(learner, data, indices):
    """Cross-validate ``learner`` on ``data`` and return a model description.

    For every fold in ``range(FOLDS)`` a classifier is trained on the
    instances whose entry in ``indices`` differs from the fold number and
    is then asked to classify the remaining instances with their class
    value hidden.

    Parameters:
        learner: callable that builds a classifier from a data table.
        data: Orange data table supporting ``selectref``.
        indices: per-instance fold numbers, aligned with ``data``.

    Returns:
        dict with the keys 'method' (class name of the learner),
        'classifier' (learner trained on all of ``data``),
        'probabilities' (one numpy array per tested instance),
        'XAnchors' / 'YAnchors' (always None here), 'attributes'
        (attribute names of the domain), 'instance_predictions' and
        'instance_classes'.

    Raises:
        ValueError: if a classifier predicts an unknown (special) value.
    """
    probabilities = []
    instance_predictions = []
    instance_classes = []

    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)
        classifier = learner(learnset)

        # testset holds exactly the instances with indices[i] == fold, so it
        # can be walked directly instead of counting over the whole table.
        for test_instance in testset:
            # classify a copy with the class hidden
            ex = Orange.data.Instance(test_instance)
            ex.setclass("?")

            cr = classifier(ex, Orange.core.GetBoth)
            if cr[0].isSpecial():
                # A bare string is not a valid exception object; raise a
                # real exception so the message actually reaches the caller.
                raise ValueError("Classifier %s returned unknown value" % (classifier.name))

            probabilities.append(numpy.array(list(cr[1])))
            instance_predictions.append(cr[0])
            instance_classes.append(test_instance.get_class())

    return {'method' : type(learner).__name__,
            'classifier' : learner(data),
            'probabilities' : probabilities,
            'XAnchors' : None,
            'YAnchors' : None,
            'attributes': [x.name for x in data.domain.attributes],
            'instance_predictions' : instance_predictions,
            'instance_classes' : instance_classes}

def build_projection_model(data, attributes, indices, visualizationMethod=vr.LINEAR_PROJECTION):
    """Project ``data`` onto 2D and cross-validate a kNN classifier on it.

    The projection is chosen by ``visualizationMethod`` (one of the
    ``orngVizRank`` constants SCATTERPLOT, RADVIZ, LINEAR_PROJECTION,
    KNN_IN_ORIGINAL_SPACE or POLYVIZ). The projected table is then
    evaluated with a 10-nearest-neighbour learner over ``range(FOLDS)``
    folds.

    Parameters:
        data: Orange data table to project.
        attributes: names of the attributes used for the projection.
        indices: per-instance fold numbers used with ``selectref``.
        visualizationMethod: projection type constant from ``orngVizRank``.

    Returns:
        A dict with the keys 'method', 'classifier', 'probabilities',
        'XAnchors', 'YAnchors', 'attributes', 'instance_predictions' and
        'instance_classes', or None when the method is invalid or no
        (non-empty) projection table could be built.

    Raises:
        ValueError: if a classifier predicts an unknown (special) value.
    """
    if visualizationMethod == vr.SCATTERPLOT:
        import orngScaleScatterPlotData
        graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
        method = "SCATTERPLOT"
    elif visualizationMethod == vr.RADVIZ:
        import orngScaleLinProjData
        graph = orngScaleLinProjData.orngScaleLinProjData()
        graph.normalizeExamples = 1
        method = "RADVIZ"
    elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
        import orngScaleLinProjData
        from orngLinProj import FreeViz
        graph = orngScaleLinProjData.orngScaleLinProjData()
        graph.normalizeExamples = 0
        method = "SPCA"
    elif visualizationMethod == vr.POLYVIZ:
        import orngScalePolyvizData
        graph = orngScalePolyvizData.orngScalePolyvizData()
        graph.normalizeExamples = 1
        method = "POLYVIZ"
    else:
        print("an invalid visualization method was specified. VizRank can not run.")
        return None

    graph.setData(data, graph.rawSubsetData)
    attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
    domain = Orange.data.Domain([orange.FloatVariable("xVar"), orange.FloatVariable("yVar"), orange.EnumVariable(graph.dataDomain.classVar.name, values=getVariableValuesSorted(graph.dataDomain.classVar))])
    classListFull = graph.originalData[graph.dataClassIndex]
    table = None
    # Bound up front so every path below leaves the anchors defined.
    XAnchors = YAnchors = None

    if visualizationMethod == vr.LINEAR_PROJECTION:
        freeviz = FreeViz(graph)
        projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
        if projections is not None:
            XAnchors, YAnchors, (attrNames, newIndices) = projections
            table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
        else:
            print('a null projection found')
    elif visualizationMethod == vr.SCATTERPLOT:
        table = graph.createProjectionAsExampleTable(attrIndices)
    else:
        XAnchors = graph.createXAnchors(len(attrIndices))
        YAnchors = graph.createYAnchors(len(attrIndices))
        validData = graph.getValidList(attrIndices)
        # more than min number of examples
        if numpy.sum(validData) >= 10:
            classList = numpy.compress(validData, classListFull)
            selectedData = numpy.compress(validData, numpy.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
            sum_i = graph._getSum_i(selectedData)
            table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)

    # No projection was produced (or it is empty): nothing to evaluate.
    if not table:
        return None

    probabilities = []
    instance_predictions = []
    instance_classes = []
    learner = orange.kNNLearner(k=10, rankWeight=0, distanceConstructor=orange.ExamplesDistanceConstructor_Euclidean(normalize=0))
    for fold in range(FOLDS):
        learnset = table.selectref(indices, fold, negate=1)
        testset = table.selectref(indices, fold, negate=0)
        classifier = learner(learnset)

        # Walk the selected test fold itself rather than counting over
        # ``data``: the projected table is what is being indexed here.
        for test_instance in testset:
            # classify a copy with the class hidden
            ex = Orange.data.Instance(test_instance)
            ex.setclass("?")

            cr = classifier(ex, Orange.core.GetBoth)
            if cr[0].isSpecial():
                # A bare string is not a valid exception object; raise a
                # real exception so the message actually reaches the caller.
                raise ValueError("Classifier %s returned unknown value" % (classifier.name))
            probabilities.append(numpy.array(list(cr[1])))
            instance_predictions.append(cr[0])
            instance_classes.append(test_instance.get_class())

    # final classifier is trained on the complete projection
    classifier = learner(table)
    return {'method' : method,
            'classifier' : classifier,
            'probabilities' : probabilities,
            'XAnchors' : XAnchors,
            'YAnchors' : YAnchors,
            'attributes': attributes,
            'instance_predictions' : instance_predictions,
            'instance_classes' : instance_classes}

def build_rf_models(data, indices=None):
    """Cross-validate the individual trees of a random forest on ``data``.

    For each fold in ``range(FOLDS)`` a forest of ``MODEL_LIMIT`` trees is
    trained on the learning part, and every tree of that forest classifies
    every instance of the test part (with the class value hidden).

    Parameters:
        data: Orange data table supporting ``selectref``.
        indices: optional per-instance fold numbers. When omitted they are
            generated with ``MakeRandomIndicesCV`` using the same settings
            as the commented-out driver code at the end of this file.

    Returns:
        A forest classifier trained on all of ``data``; its
        ``probabilities`` attribute is a list with one entry per fold,
        each a list of the ``GetBoth`` results collected in that fold.

    Raises:
        ValueError: if a tree predicts an unknown (special) value.
    """
    if indices is None:
        indices = Orange.core.MakeRandomIndicesCV(
            data, FOLDS, randseed=0,
            stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)

    def _new_forest_learner():
        # Builds a fresh forest learner over depth-limited Gini trees.
        tree = Orange.classification.tree.TreeLearner(storeNodeClassifier=1,
                   storeContingencies=0, storeDistributions=1, minExamples=5,
                   storeExamples=1).instance()
        gini = Orange.feature.scoring.Gini()
        tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
        tree.maxDepth = 4
        # the measure must be set before the split constructor is wrapped
        tree.split = Orange.ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
        return Orange.ensemble.forest.RandomForestLearner(learner=tree, trees=MODEL_LIMIT)

    # one result list per fold (FOLDS is an int, so it needs range())
    probabilities = [[] for _fold in range(FOLDS)]

    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)
        forestClassifier = _new_forest_learner()(learnset)

        for classifier in forestClassifier.classifiers:
            for test_instance in testset:
                # classify a copy with the class hidden
                ex = Orange.data.Instance(test_instance)
                ex.setclass("?")
                cr = classifier(ex, Orange.core.GetBoth)
                if cr[0].isSpecial():
                    # A bare string is not a valid exception object.
                    raise ValueError("Classifier %s returned unknown value" % (classifier.name))
                probabilities[fold].append(cr)

    # NOTE(review): the original referenced an undefined ``learner`` here;
    # the forest learner is the only learner this function builds, so it is
    # assumed to be the intended one -- confirm.
    model_classifier = _new_forest_learner()(data)
    model_classifier.probabilities = probabilities
    return model_classifier

def get_learner(type, data):
    """Placeholder learner factory; currently always returns None.

    Both arguments are ignored. The learner choices below were left
    disabled in the original code.
    """
    #if type.upper() == "TREE":
    #learner = orange.BayesLearner()
    #learner = orange.kNNLearner(k=int(math.sqrt(len(data))))
    return None

def _print_time(time_start, iter, numiter):
    """Print progress and estimated remaining time every 10000 iterations.

    Parameters:
        time_start: ``time.time()`` value taken when the work started.
        iter: number of pairs processed so far.
        numiter: number of items; the total amount of work is taken to be
            the ``numiter * (numiter - 1) / 2`` unordered pairs.

    Nothing is printed unless ``iter`` is a positive multiple of 10000.
    """
    # iter == 0 passes the modulo test but would divide by zero below
    if iter <= 0 or iter % 10000 != 0:
        return

    # explicit floor division keeps the integer results under any Python
    total_pairs = numiter * (numiter - 1) // 2
    time_elapsed = time.time() - time_start
    time_total = time_elapsed / iter * total_pairs
    time_remaining = int(time_total - time_elapsed)

    hours = time_remaining // 60 // 60
    minutes = time_remaining // 60 % 60
    seconds = time_remaining % 60
    print("%s / %s | remaining: %s : %s : %s" % (iter, total_pairs, hours, minutes, seconds))

def models2matrix(models):
    """Build a pairwise dissimilarity matrix between models.

    The dissimilarity of two models is the fraction of instances on which
    their 'instance_predictions' differ; a NaN result is stored as 1.
    Only the strict lower triangle (``j < i``) is filled, everything else
    stays 0.

    Earlier variants based on squared probability differences and on
    Spearman rank correlation of the 'probabilities' entries were disabled
    in the original code and are not computed here.

    Parameters:
        models: sequence of model dicts as returned by the builders in
            this file; only 'instance_predictions' is read.

    Returns:
        numpy array of shape ``(len(models), len(models))``.
    """
    dim = len(models)
    print("%d models to matrix -- rank" % dim)

    smx_rank = numpy.zeros(shape=(dim, dim))

    time_start = time.time()
    # one array of predicted values per model
    instance_predictions = [numpy.array([pred.value for pred in model['instance_predictions']]) for model in models]

    counter = 0
    for i in range(dim):
        for j in range(i):
            # share of instances where the two models disagree
            w = numpy.average(instance_predictions[i] != instance_predictions[j])
            smx_rank[i, j] = 1 if math.isnan(w) else w

            counter += 1
            _print_time(time_start, counter, dim)
    return smx_rank

def save_models(models, smx, fn):
    """Save the matrices, a summary table and the pickled models.

    Parameters:
        models: sequence of model dicts as returned by the builders in
            this file.
        smx: either a single matrix, saved with ``numpy.save`` under
            ``fn``, or a list of ``(matrix, title)`` pairs, each saved
            under ``'<fn>-<title>'``.
        fn: base file name (without extension) for all outputs.

    Besides the matrices this writes ``'<fn>.tab'`` (one row per model with
    a fresh uuid, attribute count, classification accuracy overall and per
    class, method and attribute names) and ``'<fn>.res'`` (a pickled dict
    mapping each uuid to its model).
    """
    print('saving matrix')
    if isinstance(smx, list):
        for matrix, title in smx:
            numpy.save('%s-%s' % (fn, title), matrix)
    else:
        numpy.save('%s' % (fn), smx)

    print('build out data')
    out = getModelsExampleTable()
    uuids = []
    for model in models:
        ex = Orange.data.Instance(out.domain)
        _uuid = uuid.uuid4().hex
        uuids.append(_uuid)
        ex['uuid'] = _uuid
        ex['number of attributes'] = len(model['attributes'])

        # (prediction was correct, true class) for every tested instance
        outcomes = [(p == c, c) for p, c in zip(model['instance_predictions'], model['instance_classes'])]
        hits = [hit for hit, _cls in outcomes]
        ex['CA'] = sum(hits) / float(len(hits))
        ex['model'] = model['method']
        ex['attributes'] = ', '.join(model['attributes'])
        #ex["AUC"] = nets[i].items[m]["AUC"].value

        # groupby only merges adjacent items, hence the sort by class first
        class_accuracies = []
        by_class = sorted(outcomes, key=itemgetter(1))
        for _k, group in itertools.groupby(by_class, lambda x: x[1].value):
            class_hits = [hit for hit, _cls in group]
            class_accuracies.append(str(sum(class_hits) / float(len(class_hits))))
        ex["CA by class"] = ', '.join(class_accuracies)
        #ex["cluster CA"] = best_indices[i][j]
        #ex["cluster size"] = median_csizes[i][j]
        ex["label"] = model['method']
        out.append(ex)

    print('saving out data')
    out.save('%s.tab' % (fn))
    print('saving models')
    # the context manager makes sure the file is flushed and closed
    with open('%s.res' % (fn), "wb") as res_file:
        pickle.dump(dict(zip(uuids, models)), res_file)


#indices = Orange.core.MakeRandomIndicesCV(data_d, FOLDS, randseed=0, stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)
##
#attributes  = getRandomAttributeSubsets(data_d.domain, MODEL_LIMIT)
#attributes += [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
##
##attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
##attributes = set(attributes)
##attributes = [attr.split(', ') for attr in attributes]
##
##
#models = []
#scatterplot_attributes = []
#for i in range(len(data_d.domain.attributes)):
#    for j in range(i):
#        scatterplot_attributes.append([data_d.domain.attributes[i].name, data_d.domain.attributes[j].name])
#        
##random.shuffle(scatterplot_attributes)
#models.extend([build_projection_model(data_d, attrs, indices, vr.SCATTERPLOT) for attrs in scatterplot_attributes])
#
#for projection_type in [vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ]:
#    models.extend([build_projection_model(data_d, attrs, indices, projection_type) for attrs in attributes])
#
#models = [model for model in models if model is not None]
#smx_prob, smx_class, smx_rank = models2matrix(models)
#
#save_models(models, [(smx_prob, 'prob'), (smx_class, 'class'), (smx_rank, 'rank')], '%s-%d' % (OUT_FILE, len(smx_prob)))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.