# orange-modelmaps / _modelmaps / modelmap.py
"""
.. index:: model map

***************
Build Model Map
***************

.. autoclass:: mm.BuildModelMap
   :members:
   
****************
Helper Functions
****************

"""

import bz2, itertools, math, random, os.path, time, uuid
import cPickle as pickle

import scipy.stats
import numpy as np

import orngVizRank as vr

from operator import itemgetter
from orngScaleData import getVariableValuesSorted
from model import Model

from Orange import data, distance, feature, ensemble
from Orange.classification.knn import kNNLearner
from Orange.classification.tree import TreeLearner

MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]

def distance_class(m1, m2):
    """Return the fraction of instances on which the two models disagree.

    Falls back to 1 (maximal distance) when the mean is undefined.
    """
    disagreement = np.average(m1.instance_predictions != m2.instance_predictions)
    if math.isnan(disagreement):
        return 1
    return disagreement

def distance_prob(m1, m2):
    """Return the mean squared distance between paired class-probability
    vectors of two models, halved (normalized by 2 * number of instances).
    """
    total = 0
    for p1, p2 in zip(m1.probabilities, m2.probabilities):
        total += np.sum(np.power(p1 - p2, 2))
    return total / (2 * len(m1.probabilities))

def distance_rank(m1, m2):
    """Return a rank-based distance between two models:
    1 - |mean Spearman rank correlation| over paired probability vectors.
    """
    ninstances = len(m1.probabilities)
    correlations = [scipy.stats.spearmanr(p1, p2)[0]
                    for p1, p2 in zip(m1.probabilities, m2.probabilities)]
    return 1 - abs(sum(correlations) / ninstances)

def get_feature_subsets_scatterplot(domain, nsubsets):
    """Return random attribute pairs for Scatter Plot projections.

    Raises AttributeError when nsubsets exceeds the number of pairs.
    """
    names = [f.name for f in domain.features]
    # All unordered pairs, each as (later attribute, earlier attribute).
    pairs = [(names[i], names[j]) for i in range(len(names)) for j in range(i)]
    random.shuffle(pairs)

    if nsubsets > len(pairs):
        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % len(pairs))

    return pairs[:nsubsets]

def get_feature_subsets(domain, nsubsets):
    """Return random attribute subsets.

    :param domain: data set domain to extract features
    :type domain: :obj:`Orange.data.Domain`

    :param nsubsets: number of attribute subsets
    :type nsubsets: int

    Raises AttributeError when nsubsets exceeds the number of possible
    subsets of size 2 .. nattrs-1.
    """

    def binomial(n, k):
        # Integer binomial coefficient. Uses // so the result is always an
        # int (the factorial ratio is exact); plain / would yield a float
        # under Python 3 semantics and break the list arithmetic below.
        if n > k:
            return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))
        elif n == k:
            return 1
        else:
            return 0

    attrs = [var.name for var in domain.features]
    nattrs = len(attrs)
    # Number of subsets of size 2 .. nattrs-1.
    total = sum(binomial(nattrs, i) for i in range(2, nattrs))

    if nsubsets > total:
        raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % total)

    # Lazily enumerate every subset and keep a uniformly random selection
    # of exactly nsubsets of them.
    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(2, nattrs))))
    selectors = [1] * nsubsets + [0] * (total - nsubsets)
    random.shuffle(selectors)
    return list(itertools.compress(combinations, selectors))

def get_models_table():
    """Return an empty data table for model meta data."""

    nattrs_var = feature.Continuous("number of attributes")
    nattrs_var.numberOfDecimals = 0
    csize_var = feature.Continuous("cluster size")
    csize_var.numberOfDecimals = 0

    attrs = [
        feature.String("uuid"),
        nattrs_var,
        feature.Continuous("CA"),
        feature.Continuous("AUC"),
        feature.String("CA by class"),
        feature.Continuous("cluster CA"),
        feature.String("label"),
        feature.String("attributes"),
        feature.Discrete("type", values=MODEL_LIST[1:]),
        feature.Python("model"),
        csize_var,
    ]

    # Domain with no class variable (second argument 0).
    return data.Table(data.Domain(attrs, 0))

class BuildModelMap(object):

    def __init__(self, fname, folds=10, model_limit=500):
        self.folds = folds
        self.model_limit = model_limit
        self.data_d = self.get_data(fname)
        self.data_c = self.get_data(fname, continuize=True)
        self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)

    def get_data(self, fname, continuize=False):
        """Return a data Table.
           
        :param fname: data set file name
        :type fname: string
        
        :param continuize:  if true, it tries to load a name-c.tab data table as Orange DomainContinuizer changes attribute names.
        :type continuize: bool
        
        """

        if continuize:
            base, ext = os.path.splitext(fname)
            fname = "%s-c%s" % (base, ext)

            table = data.Table(fname)
            return table
            ##############################################################################
            ## preprocess Data set
#            transformer = data.continuization.DomainContinuizer()
#            transformer.multinomialTreatment = data.continuization.DomainContinuizer.NValues
#            transformer.continuousTreatment = data.continuization.DomainContinuizer.NormalizeBySpan
#            transformer.classTreatment = data.continuization.DomainContinuizer.Ignore
#            table = table.translate(transformer(table))
#            return feature.imputation.AverageConstructor(table)(table)
        else:
            return data.Table(fname)


    def build_model(self, learner, data):
        """Build a classification meta-model.
        
        :param learner: classification learner to wrap
        :type learner: :obj:`Orange.classification.Learner`
        
        :param data: data set
        :type data: :obj:`Orange.data.Table`
        
        """

        probabilities = []
        instance_predictions = []
        instance_classes = []
        res = []
        # estimate class probabilities using CV
        for fold in range(self.folds):
            learnset = data.selectref(self.indices, fold, negate=1)
            testset = data.selectref(self.indices, fold, negate=0)
            classifier = learner(learnset)
            tcn = 0
            for i in range(len(data)):
                if (self.indices[i] == fold):
                    ex = data.Instance(testset[tcn])
                    ex.setclass("?")

                    cr = classifier(ex, classifier.GetBoth)
                    if cr[0].isSpecial():
                        raise "Classifier %s returned unknown value" % (classifier.name)

                    probabilities.append(np.array(list(cr[1])))
                    instance_predictions.append(cr[0])
                    instance_classes.append(testset[tcn].get_class())
                    tcn += 1

        return Model(type(learner).__name__,
                     learner(data),
                     probabilities,
                     [x.name for x in data.domain.attributes],
                     instance_predictions,
                     instance_classes)

    def build_projection_model(self, attributes, visualizationMethod):
        """Build a projection meta-model."""

        method = "?"
        if visualizationMethod == vr.SCATTERPLOT:
            import orngScaleScatterPlotData
            graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
            method = "SCATTERPLOT"
        elif visualizationMethod == vr.RADVIZ:
            import orngScaleLinProjData
            graph = orngScaleLinProjData.orngScaleLinProjData()
            graph.normalizeExamples = 1
            method = "RADVIZ"
        elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
            import orngScaleLinProjData
            from orngLinProj import FreeViz
            graph = orngScaleLinProjData.orngScaleLinProjData()
            graph.normalizeExamples = 0
            method = "SPCA"
        elif visualizationMethod == vr.POLYVIZ:
            import orngScalePolyvizData
            graph = orngScalePolyvizData.orngScalePolyvizData()
            graph.normalizeExamples = 1
            method = "POLYVIZ"
        else:
            print "an invalid visualization method was specified. VizRank can not run."
            return

        graph.setData(self.data_c, graph.rawSubsetData)
        attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
        domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(graph.dataDomain.class_var.name, values=getVariableValuesSorted(graph.dataDomain.class_var))])
        classListFull = graph.originalData[graph.dataClassIndex]
        table = None

        if visualizationMethod == vr.LINEAR_PROJECTION:
            freeviz = FreeViz(graph)
            projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
            if projections != None:
                XAnchors, YAnchors, (attrNames, newIndices) = projections
                table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
            else:
                print 'a null projection found'
        elif visualizationMethod == vr.SCATTERPLOT:
            XAnchors = YAnchors = None
            table = graph.createProjectionAsExampleTable(attrIndices)
        else:
            XAnchors = graph.createXAnchors(len(attrIndices))
            YAnchors = graph.createYAnchors(len(attrIndices))
            validData = graph.getValidList(attrIndices)
            # more than min number of examples
            if np.sum(validData) >= 10:
                classList = np.compress(validData, classListFull)
                selectedData = np.compress(validData, np.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
                sum_i = graph._getSum_i(selectedData)
                table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)

        if not table: return None

        probabilities = []
        instance_predictions = []
        instance_classes = []
        learner = kNNLearner(k=10, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
        for fold in range(self.folds):
            learnset = table.selectref(self.indices, fold, negate=1)
            testset = table.selectref(self.indices, fold, negate=0)
            classifier = learner(learnset)
            tcn = 0
            for i in range(len(table)):
                if (self.indices[i] == fold):
                    ex = data.Instance(testset[tcn])
                    ex.setclass("?")

                    cr = classifier(ex, classifier.GetBoth)
                    if cr[0].isSpecial():
                        raise "Classifier %s returned unknown value" % (classifier.name)
                    probabilities.append(np.array(list(cr[1])))
                    instance_predictions.append(cr[0])
                    instance_classes.append(testset[tcn].get_class())
                    tcn += 1

        return Model(method,
                     learner(table),
                     probabilities,
                     attributes,
                     np.array([c.value for c in instance_predictions]),
                     np.array([c.value for c in instance_classes]),
                     XAnchors=XAnchors,
                     YAnchors=YAnchors)

    def build_rf_models(self, data):
        probabilities = [[] for fold in self.folds]

        # estimate class probabilities using CV
        for fold in range(self.folds):
            learnset = data.selectref(indices, fold, negate=1)
            testset = data.selectref(indices, fold, negate=0)

            tree = TreeLearner(storeNodeClassifier=1,
                       storeContingencies=0, storeDistributions=1, minExamples=5,
                       storeExamples=1).instance()
            gini = feature.scoring.Gini()
            tree.split.discreteSplitConstructor.measure = tree.split.continuousSplitConstructor.measure = gini
            tree.maxDepth = 4
            tree.split = ensemble.forest.SplitConstructor_AttributeSubset(tree.split, 3)
            forestLearner = ensemble.forest.RandomForestLearner(learner=tree, trees=self.model_limit)
            forestClassifier = forestLearner(learnset)

            for classifier in forestClassifier.classifiers:
                tcn = 0
                for i in range(len(data)):
                    if (indices[i] == fold):
                        ex = data.Instance(testset[tcn])
                        ex.setclass("?")
                        tcn += 1
                        cr = classifier(ex, classifier.GetBoth)
                        if cr[0].isSpecial():
                            raise "Classifier %s returned unknown value" % (classifier.name)
                        probabilities.append(cr)
        model_classifier = learner(data)
        model_classifier.probabilities = probabilities


    def _print_time(self, time_start, iter, numiter):
        if iter % 10000 == 0:
            time_elapsed = time.time() - time_start
            time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
            time_remainng = int(time_total - time_elapsed)
            print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60

    def build_model_matrix(self, models, dist=distance_class):
        """Build a distance matrix of models given the distance measure."""

        dim = len(models)
        print "%d models to matrix -- rank" % dim
        smx = np.zeros(shape=(dim, dim))

        counter = 0
        time_start = time.time()
        for i in range(dim):
            for j in range(i):
                smx[i, j] = dist(models[i], models[j])
                counter += 1
                self._print_time(time_start, counter, dim)

        return smx

    def build_model_data(self, models):
        """Return an :obj:`Orange.data.Table` of model meta-data."""

        table = get_models_table()
        table.extend([model.get_instance(table.domain) for model in models])
        return table

    def save(self, fname, models=None, smx=None, table=None):
        """Save model map to disk. Model similarity matrix and data table tuple 
        is pickled and compressed as a bz2 archive.
        
        """

        if models is None and (smx is None or table is None):
            raise AttributeError("If models is none, smx and table must be given.")

        if models is not None:
            if type(models) != type([]):
                raise AttributeError("Attribute models must be a list of models.")

            if len(models) <= 0:
                raise AttributeError("Attribute models is an empty list.")

        if smx is None:
            smx = self.build_model_matrix(models)

        if table is None:
            table = self.build_model_data(models)

        pickle.dump((smx, table, self.data_d), bz2.BZ2File('%s.bz2' % fname, "w"), -1)

    def load(self, fname):
        """Load a model map. Read compressed tuple containing model similarity 
        matrix and data table.
        
        """

        smx, table, data = pickle.load(bz2.BZ2File('%s.bz2' % fname, "r"))
        return smx, table, data