Commits

Miha Stajdohar  committed 59b5243

Changed folder structure.

  • Participants
  • Parent commits fcd2311

Comments (0)

Files changed (9)

File README

-Add mm to the PYTHONPATH for examples to work.
+Add mm to the PYTHONPATH for examples to work.

File _modelmaps/__init__.py

+import math
+import os.path
+import pickle
+import random
+import time
+
+import numpy as np
+
+from orngScaleData import getVariableValuesSorted
+from OWDistanceFile import readMatrix
+
+from Orange import data, feature
+
+from model import *
+from modelmap import *
+
+ROOT = "/home/miha/work/res/metamining/"
+#OUT_FILE = ROOT + "dst/zoo"
+#OUT_FILE = ROOT + "dst/zoo"
+OUT_FILE = ROOT + "_astra_/fprdk"
+
def saveSymMatrix(matrix, file, items=None, saveItems=False):
    """Write a symmetric distance matrix to <file>.dst in the "labeled"
    distance-file format (header line, then one label plus the lower
    triangle of values per row).

    :param matrix: symmetric matrix with `dim`, `items` and [i, j] access
    :param file: output path without extension
    :param items: optional item table used for row labels; defaults to matrix.items
    :param saveItems: if True, also save the items to <file>.tab
    """
    items = items if items else matrix.items
    # 'with' guarantees the handle is closed even if a write fails
    # (the original leaked the handle on any exception).
    with open(file + ".dst", 'w') as fn:
        fn.write("%d labeled\n" % matrix.dim)
        for i in range(matrix.dim):
            fn.write("%s" % items[i]['attributes'])
            # lower triangle including the diagonal
            for j in range(i + 1):
                fn.write("\t%.6f" % matrix[i, j])
            fn.write("\n")

    if saveItems:
        items.save(file + ".tab")
+
+
+
+def loadModel(fn):
+    if os.path.exists('%s.npy' % fn):
+        matrix, _labels, data = readMatrix('%s.npy' % fn)
+    elif os.path.exists("%s-prob.dst" % fn):
+        matrix, _labels, data = readMatrix("%s-prob.dst" % fn)
+    elif os.path.exists("%s.dst" % fn):
+        matrix, _labels, data = readMatrix("%s.dst" % fn)
+    else:
+        return None
+
+    if os.path.exists("%s.tab" % fn):
+        data = data.Table("%s.tab" % fn)
+        matrix.items = data
+    else:
+        print "ExampleTable %s not found!\n" % ("%s.tab" % fn)
+    if os.path.exists("%s.res" % fn):
+        matrix.results = pickle.load(open("%s.res" % fn, 'rb'))
+    else:
+        print "Results pickle %s not found!\n" % ("%s.res" % fn)
+
+    return matrix
+
def saveModel(smx, fn):
    """Save a model matrix: distances to <fn>.dst, its item table to
    <fn>.tab and its pickled results to <fn>.res."""
    saveSymMatrix(smx, "%s" % fn, smx.items)
    smx.items.save('%s.tab' % fn)
    pickle.dump(smx.results, open('%s.res' % fn, "wb"))
+
+
+
def evaluateProjections(vizr, attributeList):
    """Evaluate every attribute subset in attributeList with the given
    VizRank instance, recording each projection via vizr.addResult.

    :param vizr: an orngVizRank VizRank instance (state is reset first)
    :param attributeList: iterable of attribute-name sequences
    :returns: number of successfully evaluated projections
    """
    vizr.evaluatedProjectionsCount = 0
    vizr.optimizedProjectionsCount = 0
    vizr.evaluationData = {}            # clear all previous data about tested permutations and stuff
    vizr.evaluationData["triedCombinations"] = {}
    vizr.clearResults()

    vizr.clearArguments()

    if vizr.projOptimizationMethod != 0:
        vizr.freeviz.useGeneralizedEigenvectors = 1
        vizr.graph.normalizeExamples = 0

    domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(vizr.graph.dataDomain.classVar.name, values=getVariableValuesSorted(vizr.graph.dataDomain.classVar))])
    classListFull = vizr.graph.originalData[vizr.graph.dataClassIndex]

    for attributes in attributeList:
        attrIndices = [vizr.graph.attributeNameIndex[attr] for attr in attributes]
        if vizr.projOptimizationMethod != 0:
            # table must be preset: when findProjection returns None the
            # original left 'table' unbound and the check below raised
            # NameError instead of skipping the subset.
            table = None
            projections = vizr.freeviz.findProjection(vizr.projOptimizationMethod, attrIndices, setAnchors=0, percentDataUsed=vizr.percentDataUsed)
            if projections is not None:
                xanchors, yanchors, (attrNames, newIndices) = projections
                table = vizr.graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=xanchors, YAnchors=yanchors)

            if table is None or len(table) < vizr.minNumOfExamples: continue
            accuracy, other_results = vizr.evaluateProjection(table)
            generalDict = {"XAnchors": list(xanchors), "YAnchors": list(yanchors), "Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {"XAnchors": list(xanchors), "YAnchors": list(yanchors)}
            vizr.addResult(accuracy, other_results, len(table), attrNames, vizr.evaluatedProjectionsCount, generalDict=generalDict)
            vizr.evaluatedProjectionsCount += 1
        else:
            XAnchors = vizr.graph.createXAnchors(len(attrIndices))
            YAnchors = vizr.graph.createYAnchors(len(attrIndices))
            validData = vizr.graph.getValidList(attrIndices)
            # this module imports numpy as np (the original used the
            # unimported name 'numpy' here, a NameError on this path)
            if np.sum(validData) >= vizr.minNumOfExamples:
                classList = np.compress(validData, classListFull)
                selectedData = np.compress(validData, np.take(vizr.graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
                sum_i = vizr.graph._getSum_i(selectedData)

                table = vizr.graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
                accuracy, other_results = vizr.evaluateProjection(table)
                generalDict = {"Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {}
                vizr.addResult(accuracy, other_results, len(table), [vizr.graph.attributeNames[i] for i in attrIndices], vizr.evaluatedProjectionsCount, generalDict)
                vizr.evaluatedProjectionsCount += 1

    return vizr.evaluatedProjectionsCount

File _modelmaps/build.py

+import uuid
+import pickle
+import itertools
+import scipy.stats
+
+import Orange
+import orngVizRank as vr
+
+from tools import *
+from operator import itemgetter
+
FOLDS = 10  # number of cross-validation folds
MODEL_LIMIT = 500  # cap on the number of models built (e.g. forest trees)

#data_c = getData(ROOT + "tab/zoo-c.tab")
# module-level data set; getData and ROOT come from 'from tools import *'
data_d = getData(ROOT + "tab/zoo.tab")
+
def build_model(learner, data, indices):
    """Build a classification meta-model description.

    Class probabilities and per-instance predictions are estimated with
    FOLDS-fold cross-validation; the returned 'classifier' entry is trained
    on the full data set.

    :param learner: Orange learner to wrap
    :param data: Orange data table
    :param indices: fold index per instance (e.g. from MakeRandomIndicesCV)
    :returns: dict describing the model (method, classifier, probabilities,
        anchors, attributes, predictions and true classes)
    """
    probabilities = []
    instance_predictions = []
    instance_classes = []
    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)
        classifier = learner(learnset)
        tcn = 0
        for i in range(len(data)):
            if indices[i] == fold:
                ex = Orange.data.Instance(testset[tcn])
                ex.setclass("?")

                cr = classifier(ex, Orange.core.GetBoth)
                if cr[0].isSpecial():
                    # raising a plain string is illegal (string exceptions
                    # were removed from Python) -- use a real exception
                    raise ValueError("Classifier %s returned unknown value" % classifier.name)

                probabilities.append(numpy.array(list(cr[1])))
                instance_predictions.append(cr[0])
                instance_classes.append(testset[tcn].get_class())
                tcn += 1

    return {'method' : type(learner).__name__,
            'classifier' : learner(data),
            'probabilities' : probabilities,
            'XAnchors' : None,
            'YAnchors' : None,
            'attributes': [x.name for x in data.domain.attributes],
            'instance_predictions' : instance_predictions,
            'instance_classes' : instance_classes}
+
+def build_projection_model(data, attributes, indices, visualizationMethod=vr.LINEAR_PROJECTION):
+    method = "?"
+    if visualizationMethod == vr.SCATTERPLOT:
+        import orngScaleScatterPlotData
+        graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
+        method = "SCATTERPLOT"
+    elif visualizationMethod == vr.RADVIZ:
+        import orngScaleLinProjData
+        graph = orngScaleLinProjData.orngScaleLinProjData()
+        graph.normalizeExamples = 1
+        method = "RADVIZ"
+    elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
+        import orngScaleLinProjData
+        from orngLinProj import FreeViz
+        graph = orngScaleLinProjData.orngScaleLinProjData()
+        graph.normalizeExamples = 0
+        method = "SPCA"
+    elif visualizationMethod == vr.POLYVIZ:
+        import orngScalePolyvizData
+        graph = orngScalePolyvizData.orngScalePolyvizData()
+        graph.normalizeExamples = 1
+        method = "POLYVIZ"
+    else:
+        print "an invalid visualization method was specified. VizRank can not run."
+        return
+
+    graph.setData(data, graph.rawSubsetData)
+    attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
+    domain = Orange.data.Domain([orange.FloatVariable("xVar"), orange.FloatVariable("yVar"), orange.EnumVariable(graph.dataDomain.classVar.name, values=getVariableValuesSorted(graph.dataDomain.classVar))])
+    classListFull = graph.originalData[graph.dataClassIndex]
+    table = None
+
+    if visualizationMethod == vr.LINEAR_PROJECTION:
+        freeviz = FreeViz(graph)
+        projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
+        if projections != None:
+            XAnchors, YAnchors, (attrNames, newIndices) = projections
+            table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
+        else:
+            print 'a null projection found'
+    elif visualizationMethod == vr.SCATTERPLOT:
+        XAnchors = YAnchors = None
+        table = graph.createProjectionAsExampleTable(attrIndices)
+    else:
+        XAnchors = graph.createXAnchors(len(attrIndices))
+        YAnchors = graph.createYAnchors(len(attrIndices))
+        validData = graph.getValidList(attrIndices)
+        # more than min number of examples
+        if numpy.sum(validData) >= 10:
+            classList = numpy.compress(validData, classListFull)
+            selectedData = numpy.compress(validData, numpy.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
+            sum_i = graph._getSum_i(selectedData)
+            table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
+
+    if not table: return None
+
+    probabilities = []
+    instance_predictions = []
+    instance_classes = []
+    learner = orange.kNNLearner(k=10, rankWeight=0, distanceConstructor=orange.ExamplesDistanceConstructor_Euclidean(normalize=0))
+    for fold in range(FOLDS):
+        learnset = table.selectref(indices, fold, negate=1)
+        testset = table.selectref(indices, fold, negate=0)
+        classifier = learner(learnset)
+        tcn = 0
+        for i in range(len(data)):
+            if (indices[i] == fold):
+                ex = Orange.data.Instance(testset[tcn])
+                ex.setclass("?")
+
+                cr = classifier(ex, Orange.core.GetBoth)
+                if cr[0].isSpecial():
+                    raise "Classifier %s returned unknown value" % (classifier.name)
+                probabilities.append(numpy.array(list(cr[1])))
+                instance_predictions.append(cr[0])
+                instance_classes.append(testset[tcn].get_class())
+                tcn += 1
+
+    classifier = learner(table)
+    return {'method' : method,
+            'classifier' : classifier,
+            'probabilities' : probabilities,
+            'XAnchors' : XAnchors,
+            'YAnchors' : YAnchors,
+            'attributes': attributes,
+            'instance_predictions' : instance_predictions,
+            'instance_classes' : instance_classes}
+
def build_rf_models(data):
    """Build random-forest member models, estimating probabilities by CV.

    NOTE(review): this function references 'indices' and 'learner', which
    are not defined in this file (the module-level 'indices' assignment
    below is commented out) -- it cannot run as written; verify before use.
    """
    # one bucket per fold -- the original iterated over the int FOLDS
    # itself ('for fold in FOLDS'), which raises TypeError
    probabilities = [[] for fold in range(FOLDS)]

    # estimate class probabilities using CV
    for fold in range(FOLDS):
        learnset = data.selectref(indices, fold, negate=1)
        testset = data.selectref(indices, fold, negate=0)

        tree = Orange.classification.tree.TreeLearner(storeNodeClassifier=1,
                   storeContingencies=0, storeDistributions=1, minExamples=5,
                   storeExamples=1).instance()
        gini = Orange.feature.scoring.Gini()
        tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
        tree.maxDepth = 4
        tree.split = Orange.ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
        forestLearner = Orange.ensemble.forest.RandomForestLearner(learner=tree, trees=MODEL_LIMIT)
        forestClassifier = forestLearner(learnset)

        for classifier in forestClassifier.classifiers:
            tcn = 0
            for i in range(len(data)):
                if indices[i] == fold:
                    ex = Orange.data.Instance(testset[tcn])
                    ex.setclass("?")
                    tcn += 1
                    cr = classifier(ex, Orange.core.GetBoth)
                    if cr[0].isSpecial():
                        # a string is not a legal exception object
                        raise ValueError("Classifier %s returned unknown value" % classifier.name)
                    # NOTE(review): appends to the flat list even though
                    # probabilities was built as a list of per-fold buckets
                    probabilities.append(cr)
    model_classifier = learner(data)
    model_classifier.probabilities = probabilities
+
def get_learner(type, data):
    """Return a learner for the given type name (stub -- currently always
    returns None; candidate learners are sketched in the comments below)."""
    learner = None
    #if type.upper() == "TREE":
    #learner = orange.BayesLearner()
    #learner = orange.kNNLearner(k=int(math.sqrt(len(data))))

    return learner
+
+def _print_time(time_start, iter, numiter):
+    if iter % 10000 == 0:
+        time_elapsed = time.time() - time_start
+        time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
+        time_remainng = int(time_total - time_elapsed)
+        print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+
def models2matrix(models):
    """Build a symmetric class-disagreement distance matrix over models.

    The distance between models i and j is the fraction of instances on
    which their predicted classes differ (NaN, from empty predictions,
    maps to 1).  Only the lower triangle (i > j) is filled.

    NOTE(review): numpy, math and time are not imported in this file
    directly; presumably re-exported by 'from tools import *' -- confirm.
    """
    dim = len(models)
    print "%d models to matrix -- rank" % dim

    #smx_prob = numpy.zeros(shape=(dim, dim))
    #smx_class = numpy.zeros(shape=(dim, dim))
    smx_rank = numpy.zeros(shape=(dim, dim))
    #smx_rank_None = numpy.zeros(shape=(dim, dim))
    ninstances = len(models[0]['probabilities'])
    normalization_factor = 2 * ninstances

    counter = 0
    time_start = time.time()
    # pre-extract prediction values once; comparing numpy arrays below is
    # vectorized per model pair
    instance_predictions = [numpy.array([pred.value for pred in model['instance_predictions']]) for model in models]
    #model_probs = [model['probabilities'] for model in models]
    for i in range(dim):
        for j in range(i):
            w = numpy.average(instance_predictions[i] !=
                                           instance_predictions[j])

            #w = sum([numpy.sum(numpy.power(p1 - p2, 2)) for \
            #            (p1, p2) in zip(model_probs[i], 
            #               model_probs[j])]) / normalization_factor

            #smx_rank[i,j] = 1 - abs(sum([scipy.stats.spearmanr(p1, p2)[0] for \
            #            (p1, p2) in zip(models[i]['probabilities'], 
            #               models[j]['probabilities'])]) / ninstances)

            #smx_rank_0[i,j] = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=0)[0])
            #smx_rank_1[i,j] = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=1)[0])
            #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=None)[0])
            # NaN (no instances to compare) is treated as maximal distance
            smx_rank[i, j] = 1 if math.isnan(w) else w

            counter += 1
            _print_time(time_start, counter, dim)
    #return smx_prob, smx_class, smx_rank
    return smx_rank
+
+def save_models(models, smx, fn):
+    print 'saving matrix'
+    if type(smx) == type([]):
+        for s, title in smx:
+            numpy.save('%s-%s' % (fn, title), s)
+    else:
+        numpy.save('%s' % (fn), smx)
+
+    print 'build out data'
+    out = getModelsExampleTable()
+    uuids = []
+    for model in models:
+        ex = Orange.data.Instance(out.domain)
+        _uuid = uuid.uuid4().hex
+        uuids.append(_uuid)
+        ex['uuid'] = _uuid
+        ex['number of attributes'] = len(model['attributes'])
+        results = [p == c for p, c in zip(model['instance_predictions'], model['instance_classes'])]
+        ex['CA'] = sum(results) / float(len(results))
+        ex['model'] = model['method']
+        ex['attributes'] = ', '.join(model['attributes'])
+        #ex["AUC"] = nets[i].items[m]["AUC"].value
+        resultsByClass = sorted([(p == c, c) for p, c in zip(model['instance_predictions'], model['instance_classes'])], key=itemgetter(1))
+        groups = []
+        for _k, g in itertools.groupby(resultsByClass, lambda x: x[1].value):
+            resultsByClass, _classes = zip(*g)
+            groups.append(resultsByClass)
+        ex["CA by class"] = ', '.join([str(sum(results) / float(len(results))) for results in groups])
+        #ex["cluster CA"] = best_indices[i][j]
+        #ex["cluster size"] = median_csizes[i][j]
+        ex["label"] = model['method']
+        out.append(ex)
+
+    print 'saving out data'
+    out.save('%s.tab' % (fn))
+    print 'saving models'
+    pickle.dump(dict(zip(uuids, models)), open('%s.res' % (fn), "wb"))
+
+
+#indices = Orange.core.MakeRandomIndicesCV(data_d, FOLDS, randseed=0, stratified=Orange.core.MakeRandomIndices.StratifiedIfPossible)
+##
+#attributes  = getRandomAttributeSubsets(data_d.domain, MODEL_LIMIT)
+#attributes += [[var.name for var in data_d.domain if var != data_d.domain.classVar]]
+##
+##attributes = [ex['attributes'].value for ex in orange.ExampleTable(ROOT + 'new\\zoo-420.tab') if ex['model'].value != 'SCATTERPLOT']
+##attributes = set(attributes)
+##attributes = [attr.split(', ') for attr in attributes]
+##
+##
+#models = []
+#scatterplot_attributes = []
+#for i in range(len(data_d.domain.attributes)):
+#    for j in range(i):
+#        scatterplot_attributes.append([data_d.domain.attributes[i].name, data_d.domain.attributes[j].name])
+#        
+##random.shuffle(scatterplot_attributes)
+#models.extend([build_projection_model(data_d, attrs, indices, vr.SCATTERPLOT) for attrs in scatterplot_attributes])
+#
+#for projection_type in [vr.LINEAR_PROJECTION, vr.RADVIZ, vr.POLYVIZ]:
+#    models.extend([build_projection_model(data_d, attrs, indices, projection_type) for attrs in attributes])
+#
+#models = [model for model in models if model is not None]
+#smx_prob, smx_class, smx_rank = models2matrix(models)
+#
+#save_models(models, [(smx_prob, 'prob'), (smx_class, 'class'), (smx_rank, 'rank')], '%s-%d' % (OUT_FILE, len(smx_prob)))

File _modelmaps/model.py

+"""
+.. index:: model
+
+*****
+Model
+*****
+
+.. autoclass:: mm.Model
+   :members:
+
+"""
+
+import uuid
+
+from itertools import groupby
+from operator import itemgetter
+
+from Orange import data
+
class Model(object):
    """Meta-model: one node in a Model Map, wrapping a trained classifier
    together with its cross-validated predictions and meta data."""

    def __init__(self, type_, classifier, probabilities, attributes, \
                 instance_predictions=None, instance_classes=None, \
                 name=None, XAnchors=None, YAnchors=None):
        """Meta-model, a node in Model Map.
        
        :param type_: model type; must be in MODEL_LIST
        :type type_: string
        
        :param classifier: classifier object of this model
        :type classifier: :obj:`Orange.classification.Classifier`
        
        :param probabilities: list of predicted probabilities (for all classes) 
        :type probabilities: list of :obj:`numpy.ndarray`
        
        :param attributes: list of attribute names
        :type attributes: list
        
        :param instance_predictions: array of predicted classes for all instances
        :type instance_predictions: :obj:`numpy.ndarray`
        
        :param instance_classes: array of true classes for all instances
        :type instance_classes: :obj:`numpy.ndarray`
        
        :param name: model name
        :type name: string
        
        :param XAnchors: 
        :type XAnchors: list 
        
        :param YAnchors: 
        :type YAnchors: list
        
        """

        # uuid is the node's stable identity within a model map
        self.uuid = uuid.uuid4().hex
        self.type = type_
        self.classifier = classifier
        self.probabilities = probabilities
        self.attributes = attributes
        self.instance_predictions = instance_predictions
        self.instance_classes = instance_classes
        # fall back to the model type when no explicit name is given
        self.name = name if name is not None else self.type
        self.XAnchors = XAnchors
        self.YAnchors = YAnchors

    def get_instance(self, domain):
        """Return an :obj:`Orange.data.Table` instance with model meta-data.
        
        :param domain: instance will match given domain 
        :type domain: :obj:`Orange.data.Domain`
        """

        inst = data.Instance(domain)

        inst['uuid'] = self.uuid
        inst['number of attributes'] = len(self.attributes)
        # overall classification accuracy over CV predictions
        results = [p == c for p, c in zip(self.instance_predictions, self.instance_classes)]
        inst['CA'] = sum(results) / float(len(results))
        inst['type'] = self.type
        inst['model'] = self
        inst['attributes'] = ', '.join(self.attributes)
        #ex["AUC"] = nets[i].items[m]["AUC"].value
        # per-class accuracy: sort by true class so groupby sees one run
        # per class
        resultsByClass = sorted([(p == c, c) for p, c in zip(self.instance_predictions, self.instance_classes)], key=itemgetter(1))
        groups = []
        # NOTE(review): groups on x[1] while sibling code in build.py groups
        # on x[1].value -- confirm the class objects hash/compare as intended
        for _k, g in groupby(resultsByClass, lambda x: x[1]):
            resultsByClass, _classes = zip(*g)
            groups.append(resultsByClass)
        inst["CA by class"] = ', '.join([str(sum(results) / float(len(results))) for results in groups])
        #ex["cluster CA"] = best_indices[i][j]
        #ex["cluster size"] = median_csizes[i][j]
        inst["label"] = self.name

        return inst

File _modelmaps/modelmap.py

+"""
+.. index:: model map
+
+***************
+Build Model Map
+***************
+
+.. autoclass:: mm.BuildModelMap
+   :members:
+   
+**************
+Help Functions
+**************
+
+"""
+
+import bz2, itertools, math, random, os.path, time, uuid
+import cPickle as pickle
+
+import scipy.stats
+import numpy as np
+
+import orngVizRank as vr
+
+from operator import itemgetter
+from orngScaleData import getVariableValuesSorted
+from model import Model
+
+from Orange import data, distance, feature, ensemble
+from Orange.classification.knn import kNNLearner
+from Orange.classification.tree import TreeLearner
+
+MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]
+
def distance_class(m1, m2):
    """Fraction of instances on which the two models' predicted classes
    disagree; returns 1 when the fraction is undefined (NaN)."""
    disagreement = np.average(m1.instance_predictions != m2.instance_predictions)
    if math.isnan(disagreement):
        return 1
    return disagreement
+
def distance_prob(m1, m2):
    """Sum of squared differences between the two models' predicted class
    distributions, normalised by twice the number of instances."""
    total = 0.0
    for p1, p2 in zip(m1.probabilities, m2.probabilities):
        total += np.sum((p1 - p2) ** 2)
    return total / (2 * len(m1.probabilities))
+
def distance_rank(m1, m2):
    """Rank-correlation distance between two models: 1 minus the absolute
    mean Spearman correlation of their per-instance probability vectors."""
    ninstances = len(m1.probabilities)
    correlations = [scipy.stats.spearmanr(p1, p2)[0]
                    for p1, p2 in zip(m1.probabilities, m2.probabilities)]
    return 1 - abs(sum(correlations) / ninstances)
+
def get_feature_subsets_scatterplot(domain, nsubsets):
    """Return nsubsets random feature-name pairs for Scatter Plot
    projections; raises AttributeError when more pairs are requested than
    exist."""
    names = [feat.name for feat in domain.features]
    pairs = [(names[i], names[j])
             for i in range(len(names)) for j in range(i)]
    random.shuffle(pairs)

    if nsubsets > len(pairs):
        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % len(pairs))

    return pairs[:nsubsets]
+
def get_feature_subsets(domain, nsubsets):
    """Return nsubsets random attribute-name subsets of sizes 2 .. nattrs-1.

    :param domain: data set domain to extract features
    :type domain: :obj:`Orange.data.Domain`

    :param nsubsets: number of attribute subsets
    :type nsubsets: int
    :raises AttributeError: when nsubsets exceeds the number of subsets
    """

    def binomial(n, k):
        # n-choose-k; 0 when k > n
        if n < k:
            return 0
        if n == k:
            return 1
        return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))

    attrs = [var.name for var in domain.features]
    nattrs = len(attrs)
    total = sum(binomial(nattrs, size) for size in range(2, nattrs))

    if nsubsets > total:
        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % total)

    combinations = itertools.chain.from_iterable(
        itertools.combinations(attrs, size) for size in range(2, nattrs))
    # select exactly nsubsets combinations uniformly via a shuffled 0/1 mask
    mask = [1] * nsubsets + [0] * (total - nsubsets)
    random.shuffle(mask)
    return list(itertools.compress(combinations, mask))
+
def get_models_table():
    """Return an empty data table for model meta data."""

    # the two count-valued columns display without decimals
    n_attrs = feature.Continuous("number of attributes")
    n_attrs.numberOfDecimals = 0
    cluster_size = feature.Continuous("cluster size")
    cluster_size.numberOfDecimals = 0

    attrs = [feature.String("uuid"),
             n_attrs,
             feature.Continuous("CA"),
             feature.Continuous("AUC"),
             feature.String("CA by class"),
             feature.Continuous("cluster CA"),
             feature.String("label"),
             feature.String("attributes"),
             feature.Discrete("type", values=MODEL_LIST[1:]),
             feature.Python("model"),
             cluster_size]

    return data.Table(data.Domain(attrs, 0))
+
+class BuildModelMap(object):
+
    def __init__(self, fname, folds=10, model_limit=500):
        """Load data and prepare CV fold indices for building a model map.

        :param fname: data set file name
        :param folds: number of cross-validation folds
        :param model_limit: maximum number of models to build
        """
        self.folds = folds
        self.model_limit = model_limit
        self.data_d = self.get_data(fname)
        # continuized variant; loads the pre-continuized <fname>-c file
        self.data_c = self.get_data(fname, continuize=True)
        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
+
    def get_data(self, fname, continuize=False):
        """Return a data Table.
           
        :param fname: data set file name
        :type fname: string
        
        :param continuize:  if true, it tries to load a name-c.tab data table as Orange DomainContinuizer changes attribute names.
        :type continuize: bool
        
        """

        if continuize:
            # rewrite path/name.ext -> path/name-c.ext and load that file
            base, ext = os.path.splitext(fname)
            fname = "%s-c%s" % (base, ext)

            table = data.Table(fname)
            # NOTE(review): the return here makes the continuization code
            # below dead -- a pre-continuized file is loaded instead of
            # transforming the data in-process.
            return table
            ##############################################################################
            ## preprocess Data set
#            transformer = data.continuization.DomainContinuizer()
#            transformer.multinomialTreatment = data.continuization.DomainContinuizer.NValues
#            transformer.continuousTreatment = data.continuization.DomainContinuizer.NormalizeBySpan
#            transformer.classTreatment = data.continuization.DomainContinuizer.Ignore
#            table = table.translate(transformer(table))
#            return feature.imputation.AverageConstructor(table)(table)
        else:
            return data.Table(fname)
+
+
+    def build_model(self, learner, data):
+        """Build a classification meta-model.
+        
+        :param learner: classification learner to wrap
+        :type learner: :obj:`Orange.classification.Learner`
+        
+        :param data: data set
+        :type data: :obj:`Orange.data.Table`
+        
+        """
+
+        probabilities = []
+        instance_predictions = []
+        instance_classes = []
+        res = []
+        # estimate class probabilities using CV
+        for fold in range(self.folds):
+            learnset = data.selectref(self.indices, fold, negate=1)
+            testset = data.selectref(self.indices, fold, negate=0)
+            classifier = learner(learnset)
+            tcn = 0
+            for i in range(len(data)):
+                if (self.indices[i] == fold):
+                    ex = data.Instance(testset[tcn])
+                    ex.setclass("?")
+
+                    cr = classifier(ex, classifier.GetBoth)
+                    if cr[0].isSpecial():
+                        raise "Classifier %s returned unknown value" % (classifier.name)
+
+                    probabilities.append(np.array(list(cr[1])))
+                    instance_predictions.append(cr[0])
+                    instance_classes.append(testset[tcn].get_class())
+                    tcn += 1
+
+        return Model(type(learner).__name__,
+                     learner(data),
+                     probabilities,
+                     [x.name for x in data.domain.attributes],
+                     instance_predictions,
+                     instance_classes)
+
+    def build_projection_model(self, attributes, visualizationMethod=vr.LINEAR_PROJECTION):
+        """Build a projection meta-model."""
+
+        method = "?"
+        if visualizationMethod == vr.SCATTERPLOT:
+            import orngScaleScatterPlotData
+            graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
+            method = "SCATTERPLOT"
+        elif visualizationMethod == vr.RADVIZ:
+            import orngScaleLinProjData
+            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph.normalizeExamples = 1
+            method = "RADVIZ"
+        elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
+            import orngScaleLinProjData
+            from orngLinProj import FreeViz
+            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph.normalizeExamples = 0
+            method = "SPCA"
+        elif visualizationMethod == vr.POLYVIZ:
+            import orngScalePolyvizData
+            graph = orngScalePolyvizData.orngScalePolyvizData()
+            graph.normalizeExamples = 1
+            method = "POLYVIZ"
+        else:
+            print "an invalid visualization method was specified. VizRank can not run."
+            return
+
+        graph.setData(self.data_c, graph.rawSubsetData)
+        attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
+        domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(graph.dataDomain.class_var.name, values=getVariableValuesSorted(graph.dataDomain.class_var))])
+        classListFull = graph.originalData[graph.dataClassIndex]
+        table = None
+
+        if visualizationMethod == vr.LINEAR_PROJECTION:
+            freeviz = FreeViz(graph)
+            projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
+            if projections != None:
+                XAnchors, YAnchors, (attrNames, newIndices) = projections
+                table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
+            else:
+                print 'a null projection found'
+        elif visualizationMethod == vr.SCATTERPLOT:
+            XAnchors = YAnchors = None
+            table = graph.createProjectionAsExampleTable(attrIndices)
+        else:
+            XAnchors = graph.createXAnchors(len(attrIndices))
+            YAnchors = graph.createYAnchors(len(attrIndices))
+            validData = graph.getValidList(attrIndices)
+            # more than min number of examples
+            if np.sum(validData) >= 10:
+                classList = np.compress(validData, classListFull)
+                selectedData = np.compress(validData, np.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
+                sum_i = graph._getSum_i(selectedData)
+                table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
+
+        if not table: return None
+
+        probabilities = []
+        instance_predictions = []
+        instance_classes = []
+        learner = kNNLearner(k=10, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
+        for fold in range(self.folds):
+            learnset = table.selectref(self.indices, fold, negate=1)
+            testset = table.selectref(self.indices, fold, negate=0)
+            classifier = learner(learnset)
+            tcn = 0
+            for i in range(len(table)):
+                if (self.indices[i] == fold):
+                    ex = data.Instance(testset[tcn])
+                    ex.setclass("?")
+
+                    cr = classifier(ex, classifier.GetBoth)
+                    if cr[0].isSpecial():
+                        raise "Classifier %s returned unknown value" % (classifier.name)
+                    probabilities.append(np.array(list(cr[1])))
+                    instance_predictions.append(cr[0])
+                    instance_classes.append(testset[tcn].get_class())
+                    tcn += 1
+
+        return Model(method,
+                     learner(table),
+                     probabilities,
+                     attributes,
+                     np.array([c.value for c in instance_predictions]),
+                     np.array([c.value for c in instance_classes]),
+                     XAnchors=XAnchors,
+                     YAnchors=YAnchors)
+
+    def build_rf_models(self, data):
+        probabilities = [[] for fold in self.folds]
+
+        # estimate class probabilities using CV
+        for fold in range(self.folds):
+            learnset = data.selectref(indices, fold, negate=1)
+            testset = data.selectref(indices, fold, negate=0)
+
+            tree = TreeLearner(storeNodeClassifier=1,
+                       storeContingencies=0, storeDistributions=1, minExamples=5,
+                       storeExamples=1).instance()
+            gini = feature.scoring.Gini()
+            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
+            tree.maxDepth = 4
+            tree.split = ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
+            forestLearner = ensemble.forest.RandomForestLearner(learner=tree, trees=self.model_limit)
+            forestClassifier = forestLearner(learnset)
+
+            for classifier in forestClassifier.classifiers:
+                tcn = 0
+                for i in range(len(data)):
+                    if (indices[i] == fold):
+                        ex = data.Instance(testset[tcn])
+                        ex.setclass("?")
+                        tcn += 1
+                        cr = classifier(ex, classifier.GetBoth)
+                        if cr[0].isSpecial():
+                            raise "Classifier %s returned unknown value" % (classifier.name)
+                        probabilities.append(cr)
+        model_classifier = learner(data)
+        model_classifier.probabilities = probabilities
+
+
+    def _print_time(self, time_start, iter, numiter):
+        if iter % 10000 == 0:
+            time_elapsed = time.time() - time_start
+            time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
+            time_remainng = int(time_total - time_elapsed)
+            print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+
+    def build_model_matrix(self, models, dist=distance_class):
+        """Build a distance matrix of models given the distance measure."""
+
+        dim = len(models)
+        print "%d models to matrix -- rank" % dim
+        smx = np.zeros(shape=(dim, dim))
+
+        counter = 0
+        time_start = time.time()
+        for i in range(dim):
+            for j in range(i):
+                smx[i, j] = dist(models[i], models[j])
+                counter += 1
+                self._print_time(time_start, counter, dim)
+
+        return smx
+
+    def build_model_data(self, models):
+        """Return an :obj:`Orange.data.Table` of model meta-data."""
+
+        table = get_models_table()
+        table.extend([model.get_instance(table.domain) for model in models])
+        return table
+
+    def save(self, fname, models=None, smx=None, table=None):
+        """Save model map to disk. Model similarity matrix and data table tuple 
+        is pickled and compressed as a bz2 archive.
+        
+        """
+
+        if models is None and (smx is None or table is None):
+            raise AttributeError("If models is none, smx and table must be given.")
+
+        if models is not None:
+            if type(models) != type([]):
+                raise AttributeError("Attribute models must be a list of models.")
+
+            if len(models) <= 0:
+                raise AttributeError("Attribute models is an empty list.")
+
+        if smx is None:
+            smx = self.build_model_matrix(models)
+
+        if table is None:
+            table = self.build_model_data(models)
+
+        pickle.dump((smx, table, self.data_d), bz2.BZ2File('%s.bz2' % fname, "w"), -1)
+
+    def load(self, fname):
+        """Load a model map. Read compressed tuple containing model similarity 
+        matrix and data table.
+        
+        """
+
+        smx, table, data = pickle.load(bz2.BZ2File('%s.bz2' % fname, "r"))
+        return smx, table, data

File mm/__init__.py

-import math
-import os.path
-import pickle
-import random
-import time
-
-import numpy as np
-
-from orngScaleData import getVariableValuesSorted
-from OWDistanceFile import readMatrix
-
-from Orange import data, feature
-
-from model import *
-from modelmap import *
-
-ROOT = "/home/miha/work/res/metamining/"
-#OUT_FILE = ROOT + "dst/zoo"
-#OUT_FILE = ROOT + "dst/zoo"
-OUT_FILE = ROOT + "_astra_/fprdk"
-
-def saveSymMatrix(matrix, file, items=None, saveItems=False):
-    fn = open(file + ".dst", 'w')
-    fn.write("%d labeled\n" % matrix.dim)
-    items = items if items else matrix.items
-    for i in range(matrix.dim):
-        fn.write("%s" % items[i]['attributes'])
-        for j in range(i + 1):
-            fn.write("\t%.6f" % matrix[i, j])
-        fn.write("\n")
-
-    fn.close()
-    if saveItems:
-        items.save(file + ".tab")
-
-
-
-def loadModel(fn):
-    if os.path.exists('%s.npy' % fn):
-        matrix, _labels, data = readMatrix('%s.npy' % fn)
-    elif os.path.exists("%s-prob.dst" % fn):
-        matrix, _labels, data = readMatrix("%s-prob.dst" % fn)
-    elif os.path.exists("%s.dst" % fn):
-        matrix, _labels, data = readMatrix("%s.dst" % fn)
-    else:
-        return None
-
-    if os.path.exists("%s.tab" % fn):
-        data = data.Table("%s.tab" % fn)
-        matrix.items = data
-    else:
-        print "ExampleTable %s not found!\n" % ("%s.tab" % fn)
-    if os.path.exists("%s.res" % fn):
-        matrix.results = pickle.load(open("%s.res" % fn, 'rb'))
-    else:
-        print "Results pickle %s not found!\n" % ("%s.res" % fn)
-
-    return matrix
-
-def saveModel(smx, fn):
-    saveSymMatrix(smx, "%s" % fn, smx.items)
-    smx.items.save('%s.tab' % fn)
-    pickle.dump(smx.results, open('%s.res' % fn, "wb"))
-
-
-
-def evaluateProjections(vizr, attributeList):
-    vizr.evaluatedProjectionsCount = 0
-    vizr.optimizedProjectionsCount = 0
-    vizr.evaluationData = {}            # clear all previous data about tested permutations and stuff
-    vizr.evaluationData["triedCombinations"] = {}
-    vizr.clearResults()
-
-    vizr.clearArguments()
-
-    if vizr.projOptimizationMethod != 0:
-        vizr.freeviz.useGeneralizedEigenvectors = 1
-        vizr.graph.normalizeExamples = 0
-
-    domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(vizr.graph.dataDomain.classVar.name, values=getVariableValuesSorted(vizr.graph.dataDomain.classVar))])
-    classListFull = vizr.graph.originalData[vizr.graph.dataClassIndex]
-
-    for attributes in attributeList:
-        attrIndices = [vizr.graph.attributeNameIndex[attr] for attr in attributes]
-        #print attrIndices
-        if vizr.projOptimizationMethod != 0:
-            projections = vizr.freeviz.findProjection(vizr.projOptimizationMethod, attrIndices, setAnchors=0, percentDataUsed=vizr.percentDataUsed)
-            if projections != None:
-                xanchors, yanchors, (attrNames, newIndices) = projections
-                table = vizr.graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=xanchors, YAnchors=yanchors)
-
-            if table == None or len(table) < vizr.minNumOfExamples: continue
-            accuracy, other_results = vizr.evaluateProjection(table)
-            generalDict = {"XAnchors": list(xanchors), "YAnchors": list(yanchors), "Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {"XAnchors": list(xanchors), "YAnchors": list(yanchors)}
-            vizr.addResult(accuracy, other_results, len(table), attrNames, vizr.evaluatedProjectionsCount, generalDict=generalDict)
-            vizr.evaluatedProjectionsCount += 1
-        else:
-            XAnchors = vizr.graph.createXAnchors(len(attrIndices))
-            YAnchors = vizr.graph.createYAnchors(len(attrIndices))
-            validData = vizr.graph.getValidList(attrIndices)
-            if numpy.sum(validData) >= vizr.minNumOfExamples:
-                classList = numpy.compress(validData, classListFull)
-                selectedData = numpy.compress(validData, numpy.take(vizr.graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
-                sum_i = vizr.graph._getSum_i(selectedData)
-
-                table = vizr.graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
-                accuracy, other_results = vizr.evaluateProjection(table)
-                generalDict = {"Results": vizr.evaluationResults} if vizr.saveEvaluationResults else {}
-                vizr.addResult(accuracy, other_results, len(table), [vizr.graph.attributeNames[i] for i in attrIndices], vizr.evaluatedProjectionsCount, generalDict)
-                vizr.evaluatedProjectionsCount += 1
-
-    return vizr.evaluatedProjectionsCount

File mm/model.py

-"""
-.. index:: model
-
-*****
-Model
-*****
-
-.. autoclass:: mm.Model
-   :members:
-
-"""
-
-import uuid
-
-from itertools import groupby
-from operator import itemgetter
-
-from Orange import data
-
-class Model(object):
-
-    def __init__(self, type_, classifier, probabilities, attributes, \
-                 instance_predictions=None, instance_classes=None, \
-                 name=None, XAnchors=None, YAnchors=None):
-        """Meta-model, a node in Model Map.
-        
-        :param type_: model type; must be in MODEL_LIST
-        :type type_: string
-        
-        :param classifier: classifier object of this model
-        :type classifier: :obj:`Orange.classification.Classifier`
-        
-        :param probabilities: list of predicted probabilities (for all classes) 
-        :type probabilities: list of :obj:`numpy.ndarray`
-        
-        :param attributes: list of attribute names
-        :type attributes: list
-        
-        :param instance_predictions: array of predicted classes for all instances
-        :type instance_predictions: :obj:`numpy.ndarray`
-        
-        :param instance_classes: array of true classes for all instances
-        :type instance_classes: :obj:`numpy.ndarray`
-        
-        :param name: model name
-        :type name: string
-        
-        :param XAnchors: 
-        :type XAnchors: list 
-        
-        :param YAnchors: 
-        :type YAnchors: list
-        
-        """
-
-        self.uuid = uuid.uuid4().hex
-        self.type = type_
-        self.classifier = classifier
-        self.probabilities = probabilities
-        self.attributes = attributes
-        self.instance_predictions = instance_predictions
-        self.instance_classes = instance_classes
-        self.name = name if name is not None else self.type
-        self.XAnchors = XAnchors
-        self.YAnchors = YAnchors
-
-    def get_instance(self, domain):
-        """Return an :obj:`Orange.data.Table` instance with model meta-data.
-        
-        :param domain: instance will match given domain 
-        :type domain: :obj:`Orange.data.Domain`
-        """
-
-        inst = data.Instance(domain)
-
-        inst['uuid'] = self.uuid
-        inst['number of attributes'] = len(self.attributes)
-        results = [p == c for p, c in zip(self.instance_predictions, self.instance_classes)]
-        inst['CA'] = sum(results) / float(len(results))
-        inst['type'] = self.type
-        inst['model'] = self
-        inst['attributes'] = ', '.join(self.attributes)
-        #ex["AUC"] = nets[i].items[m]["AUC"].value
-        resultsByClass = sorted([(p == c, c) for p, c in zip(self.instance_predictions, self.instance_classes)], key=itemgetter(1))
-        groups = []
-        for _k, g in groupby(resultsByClass, lambda x: x[1]):
-            resultsByClass, _classes = zip(*g)
-            groups.append(resultsByClass)
-        inst["CA by class"] = ', '.join([str(sum(results) / float(len(results))) for results in groups])
-        #ex["cluster CA"] = best_indices[i][j]
-        #ex["cluster size"] = median_csizes[i][j]
-        inst["label"] = self.name
-
-        return inst

File mm/modelmap.py

-"""
-.. index:: model map
-
-***************
-Build Model Map
-***************
-
-.. autoclass:: mm.BuildModelMap
-   :members:
-   
-**************
-Help Functions
-**************
-
-"""
-
-import bz2, itertools, math, random, os.path, time, uuid
-import cPickle as pickle
-
-import scipy.stats
-import numpy as np
-
-import orngVizRank as vr
-
-from operator import itemgetter
-from orngScaleData import getVariableValuesSorted
-from model import Model
-
-from Orange import data, distance, feature, ensemble
-from Orange.classification.knn import kNNLearner
-from Orange.classification.tree import TreeLearner
-
-MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]
-
-def distance_class(m1, m2):
-    w = np.average(m1.instance_predictions != m2.instance_predictions)
-    return 1 if math.isnan(w) else w
-
-def distance_prob(m1, m2):
-    ninstances = len(m1.probabilities)
-    normalization_factor = 2 * ninstances
-
-    return sum([np.sum(np.power(p1 - p2, 2)) for \
-                        (p1, p2) in zip(m1.probabilities, \
-                           m2.probabilities)]) / normalization_factor
-
-def distance_rank(m1, m2):
-    ninstances = len(m1.probabilities)
-
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=0)[0])
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=1)[0])
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=None)[0])
-    w = 1 - abs(sum([scipy.stats.spearmanr(p1, p2)[0] for \
-                        (p1, p2) in zip(m1.probabilities,
-                           m2.probabilities)]) / ninstances)
-    return w
-
-def get_feature_subsets_scatterplot(domain, nsubsets):
-    """Return attribute subsets for Scatter Plot."""
-    attrs = []
-    for i in range(len(domain.features)):
-        for j in range(i):
-            attrs.append((domain.features[i].name, domain.features[j].name))
-    random.shuffle(attrs)
-
-    if nsubsets > len(attrs):
-        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % len(attrs))
-
-    return attrs[:nsubsets]
-
-def get_feature_subsets(domain, nsubsets):
-    """Return random attribute subsets.
-    
-    :param domain: data set domain to extract features
-    :type domain: :obj:`Orange.data.Domain`
-    
-    :param nsubsets:  number of attribute subsets
-    :type nsubsets: int
-    """
-
-    def binomial(n, k):
-        if n > k:
-            return math.factorial(n) / (math.factorial(k) * math.factorial(n - k))
-        elif n == k:
-            return 1
-        else:
-            return 0
-
-    attrs = [var.name for var in domain.features]
-    nattrs = len(attrs)
-    total = sum(binomial(nattrs, i) for i in range(2, nattrs))
-
-    if nsubsets > total:
-        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % total)
-
-    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(2, nattrs))))
-    selectors = [1] * nsubsets + [0] * (total - nsubsets)
-    random.shuffle(selectors)
-    return list(itertools.compress(combinations, selectors))
-
-def get_models_table():
-    """Return an empty data table for model meta data."""
-
-    attrs = []
-    attrs.append(feature.String("uuid"))
-    varAttrs = feature.Continuous("number of attributes")
-    varAttrs.numberOfDecimals = 0
-    attrs.append(varAttrs)
-    attrs.append(feature.Continuous("CA"))
-    attrs.append(feature.Continuous("AUC"))
-    attrs.append(feature.String("CA by class"))
-    attrs.append(feature.Continuous("cluster CA"))
-    attrs.append(feature.String("label"))
-    attrs.append(feature.String("attributes"))
-    attrs.append(feature.Discrete("type", values=MODEL_LIST[1:]))
-    attrs.append(feature.Python("model"))
-    csizes = feature.Continuous("cluster size")
-    csizes.numberOfDecimals = 0
-    attrs.append(csizes)
-
-    return data.Table(data.Domain(attrs, 0))
-
-class BuildModelMap(object):
-
-    def __init__(self, fname, folds=10, model_limit=500):
-        self.folds = folds
-        self.model_limit = model_limit
-        self.data_d = self.get_data(fname)
-        self.data_c = self.get_data(fname, continuize=True)
-        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
-
-    def get_data(self, fname, continuize=False):
-        """Return a data Table.
-           
-        :param fname: data set file name
-        :type fname: string
-        
-        :param continuize:  if true, it tries to load a name-c.tab data table as Orange DomainContinuizer changes attribute names.
-        :type continuize: bool
-        
-        """
-
-        if continuize:
-            base, ext = os.path.splitext(fname)
-            fname = "%s-c%s" % (base, ext)
-
-            table = data.Table(fname)
-            return table
-            ##############################################################################
-            ## preprocess Data set
-#            transformer = data.continuization.DomainContinuizer()
-#            transformer.multinomialTreatment = data.continuization.DomainContinuizer.NValues
-#            transformer.continuousTreatment = data.continuization.DomainContinuizer.NormalizeBySpan
-#            transformer.classTreatment = data.continuization.DomainContinuizer.Ignore
-#            table = table.translate(transformer(table))
-#            return feature.imputation.AverageConstructor(table)(table)
-        else:
-            return data.Table(fname)
-
-
-    def build_model(self, learner, data):
-        """Build a classification meta-model.
-        
-        :param learner: classification learner to wrap
-        :type learner: :obj:`Orange.classification.Learner`
-        
-        :param data: data set
-        :type data: :obj:`Orange.data.Table`
-        
-        """
-
-        probabilities = []
-        instance_predictions = []
-        instance_classes = []
-        res = []
-        # estimate class probabilities using CV
-        for fold in range(self.folds):
-            learnset = data.selectref(self.indices, fold, negate=1)
-            testset = data.selectref(self.indices, fold, negate=0)
-            classifier = learner(learnset)
-            tcn = 0
-            for i in range(len(data)):
-                if (self.indices[i] == fold):
-                    ex = data.Instance(testset[tcn])
-                    ex.setclass("?")
-
-                    cr = classifier(ex, classifier.GetBoth)
-                    if cr[0].isSpecial():
-                        raise "Classifier %s returned unknown value" % (classifier.name)
-
-                    probabilities.append(np.array(list(cr[1])))
-                    instance_predictions.append(cr[0])
-                    instance_classes.append(testset[tcn].get_class())
-                    tcn += 1
-
-        return Model(type(learner).__name__,
-                     learner(data),
-                     probabilities,
-                     [x.name for x in data.domain.attributes],
-                     instance_predictions,
-                     instance_classes)
-
-    def build_projection_model(self, attributes, visualizationMethod=vr.LINEAR_PROJECTION):
-        """Build a projection meta-model."""
-
-        method = "?"
-        if visualizationMethod == vr.SCATTERPLOT:
-            import orngScaleScatterPlotData
-            graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
-            method = "SCATTERPLOT"
-        elif visualizationMethod == vr.RADVIZ:
-            import orngScaleLinProjData
-            graph = orngScaleLinProjData.orngScaleLinProjData()
-            graph.normalizeExamples = 1
-            method = "RADVIZ"
-        elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
-            import orngScaleLinProjData
-            from orngLinProj import FreeViz
-            graph = orngScaleLinProjData.orngScaleLinProjData()
-            graph.normalizeExamples = 0
-            method = "SPCA"
-        elif visualizationMethod == vr.POLYVIZ:
-            import orngScalePolyvizData
-            graph = orngScalePolyvizData.orngScalePolyvizData()
-            graph.normalizeExamples = 1
-            method = "POLYVIZ"
-        else:
-            print "an invalid visualization method was specified. VizRank can not run."
-            return
-
-        graph.setData(self.data_c, graph.rawSubsetData)
-        attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
-        domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(graph.dataDomain.class_var.name, values=getVariableValuesSorted(graph.dataDomain.class_var))])
-        classListFull = graph.originalData[graph.dataClassIndex]
-        table = None
-
-        if visualizationMethod == vr.LINEAR_PROJECTION:
-            freeviz = FreeViz(graph)
-            projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
-            if projections != None:
-                XAnchors, YAnchors, (attrNames, newIndices) = projections
-                table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
-            else:
-                print 'a null projection found'
-        elif visualizationMethod == vr.SCATTERPLOT:
-            XAnchors = YAnchors = None
-            table = graph.createProjectionAsExampleTable(attrIndices)
-        else:
-            XAnchors = graph.createXAnchors(len(attrIndices))
-            YAnchors = graph.createYAnchors(len(attrIndices))
-            validData = graph.getValidList(attrIndices)
-            # more than min number of examples
-            if np.sum(validData) >= 10:
-                classList = np.compress(validData, classListFull)
-                selectedData = np.compress(validData, np.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
-                sum_i = graph._getSum_i(selectedData)
-                table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
-
-        if not table: return None
-
-        probabilities = []
-        instance_predictions = []
-        instance_classes = []
-        learner = kNNLearner(k=10, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
-        for fold in range(self.folds):
-            learnset = table.selectref(self.indices, fold, negate=1)
-            testset = table.selectref(self.indices, fold, negate=0)
-            classifier = learner(learnset)
-            tcn = 0
-            for i in range(len(table)):
-                if (self.indices[i] == fold):
-                    ex = data.Instance(testset[tcn])
-                    ex.setclass("?")
-
-                    cr = classifier(ex, classifier.GetBoth)
-                    if cr[0].isSpecial():
-                        raise "Classifier %s returned unknown value" % (classifier.name)
-                    probabilities.append(np.array(list(cr[1])))
-                    instance_predictions.append(cr[0])
-                    instance_classes.append(testset[tcn].get_class())
-                    tcn += 1
-
-        return Model(method,
-                     learner(table),
-                     probabilities,
-                     attributes,
-                     np.array([c.value for c in instance_predictions]),
-                     np.array([c.value for c in instance_classes]),
-                     XAnchors=XAnchors,
-                     YAnchors=YAnchors)
-
-    def build_rf_models(self, data):
-        probabilities = [[] for fold in self.folds]
-
-        # estimate class probabilities using CV
-        for fold in range(self.folds):
-            learnset = data.selectref(indices, fold, negate=1)
-            testset = data.selectref(indices, fold, negate=0)
-
-            tree = TreeLearner(storeNodeClassifier=1,
-                       storeContingencies=0, storeDistributions=1, minExamples=5,
-                       storeExamples=1).instance()
-            gini = feature.scoring.Gini()
-            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
-            tree.maxDepth = 4
-            tree.split = ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
-            forestLearner = ensemble.forest.RandomForestLearner(learner=tree, trees=self.model_limit)
-            forestClassifier = forestLearner(learnset)
-
-            for classifier in forestClassifier.classifiers:
-                tcn = 0
-                for i in range(len(data)):
-                    if (indices[i] == fold):
-                        ex = data.Instance(testset[tcn])
-                        ex.setclass("?")
-                        tcn += 1
-                        cr = classifier(ex, classifier.GetBoth)
-                        if cr[0].isSpecial():
-                            raise "Classifier %s returned unknown value" % (classifier.name)
-                        probabilities.append(cr)
-        model_classifier = learner(data)
-        model_classifier.probabilities = probabilities
-
-
-    def _print_time(self, time_start, iter, numiter):
-        if iter % 10000 == 0:
-            time_elapsed = time.time() - time_start
-            time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
-            time_remainng = int(time_total - time_elapsed)
-            print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
-
-    def build_model_matrix(self, models, dist=distance_class):
-        """Build a distance matrix of models given the distance measure."""
-
-        dim = len(models)
-        print "%d models to matrix -- rank" % dim
-        smx = np.zeros(shape=(dim, dim))
-
-        counter = 0
-        time_start = time.time()
-        for i in range(dim):
-            for j in range(i):
-                smx[i, j] = dist(models[i], models[j])
-                counter += 1
-                self._print_time(time_start, counter, dim)
-
-        return smx
-
-    def build_model_data(self, models):
-        """Return an :obj:`Orange.data.Table` of model meta-data."""
-
-        table = get_models_table()
-        table.extend([model.get_instance(table.domain) for model in models])
-        return table
-
-    def save(self, fname, models=None, smx=None, table=None):
-        """Save model map to disk. Model similarity matrix and data table tuple 
-        is pickled and compressed as a bz2 archive.
-        
-        """
-
-        if models is None and (smx is None or table is None):
-            raise AttributeError("If models is none, smx and table must be given.")
-
-        if models is not None:
-            if type(models) != type([]):
-                raise AttributeError("Attribute models must be a list of models.")
-
-            if len(models) <= 0:
-                raise AttributeError("Attribute models is an empty list.")
-
-        if smx is None:
-            smx = self.build_model_matrix(models)
-
-        if table is None:
-            table = self.build_model_data(models)
-
-        pickle.dump((smx, table, self.data_d), bz2.BZ2File('%s.bz2' % fname, "w"), -1)
-
-    def load(self, fname):
-        """Load a model map. Read compressed tuple containing model similarity 
-        matrix and data table.
-        
-        """
-
-        smx, table, data = pickle.load(bz2.BZ2File('%s.bz2' % fname, "r"))
-        return smx, table, data