Commits

Miha Stajdohar committed 133083e

All sorts of bug fixes.

  • Parent commits e49e39e

Comments (0)

Files changed (5)

File _modelmaps/__init__.py

 
 import numpy as np
 
-from orngScaleData import getVariableValuesSorted
+#from orngScaleData import getVariableValuesSorted
 #from OWDistanceFile import readMatrix
 
 from Orange import data, feature
 from model import *
 from modelmap import *
 
-ROOT = "/home/miha/work/res/metamining/"
+#ROOT = "/home/miha/work/res/metamining/"
 #OUT_FILE = ROOT + "dst/zoo"
 #OUT_FILE = ROOT + "dst/zoo"
-OUT_FILE = ROOT + "_astra_/fprdk"
+#OUT_FILE = ROOT + "_astra_/fprdk"
 
 #def saveSymMatrix(matrix, file, items=None, saveItems=False):
 #    fn = open(file + ".dst", 'w')
 #    pickle.dump(smx.results, open('%s.res' % fn, "wb"))
 
 
-
+"""
 def evaluateProjections(vizr, attributeList):
     vizr.evaluatedProjectionsCount = 0
     vizr.optimizedProjectionsCount = 0
                 vizr.evaluatedProjectionsCount += 1
 
     return vizr.evaluatedProjectionsCount
+"""

File _modelmaps/model.py

 """
 
 import uuid
+import numpy as np
 
 from itertools import groupby
 from operator import itemgetter
 
 class Model(object):
 
-    def __init__(self, type_, classifier, probabilities, attributes, \
+    def __init__(self, type_, classifier, probabilities, \
+                 class_values, attributes, \
                  instance_predictions=None, instance_classes=None, \
                  name=None, XAnchors=None, YAnchors=None):
         """Meta-model, a node in Model Map.
         self.type = type_
         self.classifier = classifier
         self.probabilities = probabilities
+        self.class_values = class_values
         self.attributes = attributes
         self.instance_predictions = instance_predictions
         self.instance_classes = instance_classes
         inst['number of attributes'] = len(self.attributes)
         results = [p == c for p, c in zip(self.instance_predictions, self.instance_classes)]
         inst['CA'] = sum(results) / float(len(results))
+        inst['P'] = np.mean([p[self.class_values[c]] for p, c in zip(self.probabilities, self.instance_classes)])
         inst['type'] = self.type
         inst['model'] = self
         inst['attributes'] = ', '.join(self.attributes)
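
A worked toy example of the new 'P' score: the mean probability a model assigned to each instance's true class, with class_values mapping class values to columns of the probability matrix. All names and numbers below are illustrative, not part of this commit:

    import numpy as np

    # Three instances, two classes; each row is a predicted class distribution.
    probabilities = np.array([[0.9, 0.1],
                              [0.2, 0.8],
                              [0.6, 0.4]])
    class_values = {"yes": 0, "no": 1}      # class value -> column index
    instance_classes = ["yes", "no", "no"]  # true labels

    P = np.mean([p[class_values[c]] for p, c in zip(probabilities, instance_classes)])
    print P  # (0.9 + 0.8 + 0.4) / 3 = 0.7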

File _modelmaps/modelmap.py

 from orngScaleData import getVariableValuesSorted
 from model import Model
 
-from Orange import data, distance, ensemble, feature, misc
+from Orange import data, distance, ensemble, feature, misc, projection
 from Orange.classification.knn import kNNLearner
 from Orange.classification.tree import TreeLearner
 
 MODEL_LIST = ["", "SCATTERPLOT", "RADVIZ", "SPCA", "POLYVIZ", "TREE", "NaiveLearner", "kNNLearner", "SVM"]
 
+def distance_mi(m1, m2):
+    """Return inverted normalized mutual information.
+
+    1 - NMI(m1.instance_predictions, m2.instance_predictions)
+    """
+
+    classes1 = np.unique(m1.instance_predictions)
+    classes2 = np.unique(m2.instance_predictions)
+    m1_classes = [m1.instance_predictions == c for c in classes1]
+    m2_classes = [m2.instance_predictions == c for c in classes2]
+    m1_p = [np.average(m1_c1) for m1_c1 in m1_classes]
+    m2_p = [np.average(m2_c2) for m2_c2 in m2_classes]
+
+    eps = np.finfo(float).eps
+    mi = sum(sum(np.average(m1_c1 & m2_c2)
+                 * np.log2(max(np.average(m1_c1 & m2_c2) / p1 / p2, eps))
+                 for m2_c2, p2 in zip(m2_classes, m2_p))
+             for m1_c1, p1 in zip(m1_classes, m1_p))
+    h1 = -sum(p * np.log2(p) for p in m1_p)
+    h2 = -sum(p * np.log2(p) for p in m2_p)
+
+    return 0 if h1 == 0 and h2 == 0 else 1. - 2 * mi / (h1 + h2)
+
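
A quick sanity check of distance_mi, where NMI(X, Y) = 2 I(X; Y) / (H(X) + H(Y)): identical predictions give distance 0, statistically independent ones give 1. Stub is a hypothetical stand-in exposing only the attribute distance_mi reads, not the Model class from model.py:

    import numpy as np

    class Stub(object):
        def __init__(self, preds):
            self.instance_predictions = np.array(preds)

    a = Stub([0, 0, 1, 1])
    b = Stub([0, 0, 1, 1])  # same partition as a
    c = Stub([0, 1, 0, 1])  # independent of a

    print distance_mi(a, b)  # 0.0 -- perfect agreement
    print distance_mi(a, c)  # 1.0 -- no mutual information
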
 def distance_class(m1, m2):
     w = np.average(m1.instance_predictions != m2.instance_predictions)
     return 1 if math.isnan(w) else w
 
-def distance_prob(m1, m2):
-    ninstances = len(m1.probabilities)
-    normalization_factor = 2 * ninstances
+def distance_manhattan(m1, m2):
+    return np.sum(np.abs(m1.probabilities - m2.probabilities)) / 2. / len(m1.probabilities)
 
-    return sum([np.sum(np.power(p1 - p2, 2)) for \
-                        (p1, p2) in zip(m1.probabilities, \
-                           m2.probabilities)]) / normalization_factor
+def distance_euclidean(m1, m2):
+    return np.sum(np.sqrt(np.sum((m1.probabilities - m2.probabilities)**2, axis=1))) / math.sqrt(2.) / len(m1.probabilities)
 
 def distance_rank(m1, m2):
-    ninstances = len(m1.probabilities)
-
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=0)[0])
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=1)[0])
-    #w = 1 - abs(scipy.stats.spearmanr(model_probs[i], model_probs[j], axis=None)[0])
-    w = 1 - abs(sum([scipy.stats.spearmanr(p1, p2)[0] for \
-                        (p1, p2) in zip(m1.probabilities,
-                           m2.probabilities)]) / ninstances)
-    return w
+    return 1 - abs(scipy.stats.spearmanr(m1.probabilities, m2.probabilities, axis=None)[0])
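
The three probability-based measures side by side, on two toy models with opposite predicted distributions (Stub is again a hypothetical stand-in; the functions above are assumed to be in scope):

    import numpy as np

    class Stub(object):
        def __init__(self, probs):
            self.probabilities = np.array(probs)

    m1 = Stub([[1.0, 0.0], [0.0, 1.0]])
    m2 = Stub([[0.0, 1.0], [1.0, 0.0]])

    print distance_manhattan(m1, m2)  # 1.0 -- normalized to [0, 1]
    print distance_euclidean(m1, m2)  # 1.0 -- likewise normalized
    print distance_rank(m1, m2)       # 0.0 -- |Spearman rho| is 1 even for anti-correlated ranks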
 
 def get_feature_subsets_scatterplot(domain, nsubsets):
     """Return attribute subsets for Scatter Plot."""
 
     return attrs[:nsubsets]
 
-def get_feature_subsets(domain, nsubsets):
+def get_feature_subsets(domain, nsubsets=None, min_features=None, max_features=None):
     """Return random attribute subsets.
     
     :param domain: data set domain to extract features
         else:
             return 0
 
+    max_features = len(domain.features) if max_features is None else min(max_features, len(domain.features))
+    min_features = 2 if min_features is None else max(min(min_features, max_features), 2)
+
     attrs = [var.name for var in domain.features]
     nattrs = len(attrs)
-    total = sum(binomial(nattrs, i) for i in range(2, nattrs))
+    total = sum(binomial(nattrs, i) for i in range(min_features, max_features + 1))
+
+    nsubsets = total if nsubsets is None else nsubsets
 
     if nsubsets > total:
         raise AttributeError("Attribute nsubsets higher than number of possible combinations: %d." % total)
 
-    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(2, nattrs))))
+    combinations = (itertools.chain(*(itertools.combinations(attrs, i) for i in range(min_features, max_features + 1))))
     selectors = [1] * nsubsets + [0] * (total - nsubsets)
     random.shuffle(selectors)
     return list(itertools.compress(combinations, selectors))
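
A usage sketch for the extended signature, assuming an Orange 2.x environment ("iris" is an illustrative data set name). With nsubsets=None every combination in the allowed size range is returned; min_features and max_features bound the subset sizes:

    from Orange import data

    table = data.Table("iris")

    # Every subset of 2 or 3 features.
    all_23 = get_feature_subsets(table.domain, min_features=2, max_features=3)

    # Ten random subsets of any admissible size (2 up to all features).
    some = get_feature_subsets(table.domain, nsubsets=10)
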
     varAttrs.numberOfDecimals = 0
     attrs.append(varAttrs)
     attrs.append(feature.Continuous("CA"))
+    attrs.append(feature.Continuous("P"))
     attrs.append(feature.Continuous("AUC"))
     attrs.append(feature.String("CA by class"))
     attrs.append(feature.Continuous("cluster CA"))
 
     return data.Table(data.Domain(attrs, 0))
 
+
+def load(file_name):
+    """Load a model map. Read compressed tuple containing model similarity
+    matrix and data table.
+
+    """
+
+    base, ext = os.path.splitext(file_name)
+    file_name = base if ext.lower() == ".bz2" else file_name
+
+    smx, table, original_data = pickle.load(bz2.BZ2File('%s.bz2' % file_name, "r"))
+    return smx, table, original_data
+
+def save(file_name, smx, model_data, original_data):
+    """Save model map to disk. Model similarity matrix and data table tuple
+    is pickled and compressed as a bz2 archive.
+
+    """
+
+    if original_data is None or smx is None or model_data is None:
+        raise AttributeError("Distance matrix, model meta-data table, and original data table must be given.")
+
+    base, ext = os.path.splitext(file_name)
+    file_name = base if ext.lower() == ".bz2" else file_name
+
+    pickle.dump((smx, model_data, original_data), bz2.BZ2File('%s.bz2' % file_name, "w"), -1)
+
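
A round-trip sketch for the two module-level helpers (file name and toy values are illustrative):

    from Orange import data, misc

    smx = misc.SymMatrix(2)  # toy 2x2 distance matrix
    smx[1, 0] = 0.5
    model_data = original_data = data.Table("zoo")  # placeholders for the two tables

    save("zoo_map", smx, model_data, original_data)      # writes zoo_map.bz2
    smx2, model_data2, original_data2 = load("zoo_map")  # ".bz2" is appended back
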
 class BuildModelMap(object):
 
     def __init__(self, fname, folds=10, model_limit=500):
         self.folds = folds
         self.model_limit = model_limit
-        self.data_d = self.get_data(fname)
-        self.data_c = self.get_data(fname, continuize=True)
+        self.data_d = self._get_data(fname)
+        #self.data_c = self._get_data(fname, continuize=True)
+        self.data_d = data.filter.IsDefined(domain=self.data_d.domain)(self.data_d)
+
         self.indices = data.sample.SubsetIndicesCV(self.data_d, self.folds, randseed=0)
 
-    def get_data(self, fname, continuize=False):
+    def _get_data(self, fname, continuize=False):
         """Return a data Table.
            
         :param fname: data set file name
 
         if continuize:
             base, ext = os.path.splitext(fname)
-            fname = "%s-c%s" % (base, ext)
+            # Continuized data sets carry a "-c" suffix; avoid doubling it.
+            if base.endswith("-c"):
+                fname = "%s%s" % (base, ext)
+            else:
+                fname = "%s-c%s" % (base, ext)
 
             table = data.Table(fname)
             return table
         else:
             return data.Table(fname)
 
+    def data(self):
+        return self.data_d
 
     def build_model(self, learner, data):
         """Build a classification meta-model.
                     if cr[0].isSpecial():
                         raise "Classifier %s returned unknown value" % (classifier.name)
 
-                    probabilities.append(np.array(list(cr[1])))
+                    probabilities.append(list(cr[1]))
                     instance_predictions.append(cr[0])
                     instance_classes.append(testset[tcn].get_class())
                     tcn += 1
 
         return Model(type(learner).__name__,
                      learner(data),
-                     probabilities,
+                     np.array(probabilities),
+                     {val: i for i, val in enumerate(self.data_d.domain.class_var.values)},
                      [x.name for x in data.domain.attributes],
                      instance_predictions,
                      instance_classes)
 
         method = "?"
         if visualizationMethod == vr.SCATTERPLOT:
-            import orngScaleScatterPlotData
-            graph = orngScaleScatterPlotData.orngScaleScatterPlotData()
+            graph = data.preprocess.scaling.ScaleScatterPlotData()
             method = "SCATTERPLOT"
         elif visualizationMethod == vr.RADVIZ:
-            import orngScaleLinProjData
-            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph = data.preprocess.scaling.ScaleLinProjData()
             graph.normalizeExamples = 1
             method = "RADVIZ"
         elif visualizationMethod in [vr.LINEAR_PROJECTION, vr.KNN_IN_ORIGINAL_SPACE]:
-            import orngScaleLinProjData
-            from orngLinProj import FreeViz
-            graph = orngScaleLinProjData.orngScaleLinProjData()
+            graph = data.preprocess.scaling.ScaleLinProjData()
             graph.normalizeExamples = 0
             method = "SPCA"
         elif visualizationMethod == vr.POLYVIZ:
-            import orngScalePolyvizData
-            graph = orngScalePolyvizData.orngScalePolyvizData()
+            graph = data.preprocess.scaling.ScalePolyvizData()
             graph.normalizeExamples = 1
             method = "POLYVIZ"
         else:
             print "an invalid visualization method was specified. VizRank can not run."
             return
 
-        graph.setData(self.data_c, graph.rawSubsetData)
+        graph.setData(self.data_d, graph.rawSubsetData)
         attrIndices = [graph.attributeNameIndex[attr] for attr in attributes]
-        domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), feature.Discrete(graph.dataDomain.class_var.name, values=getVariableValuesSorted(graph.dataDomain.class_var))])
-        classListFull = graph.originalData[graph.dataClassIndex]
+        #domain = data.Domain([feature.Continuous("xVar"), feature.Continuous("yVar"), graph.dataDomain.class_var])
+        classListFull = graph.original_data[graph.dataClassIndex]
         table = None
 
         if visualizationMethod == vr.LINEAR_PROJECTION:
-            freeviz = FreeViz(graph)
-            projections = freeviz.findProjection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
+            freeviz = projection.linear.FreeViz(graph)
+            projections = freeviz.find_projection(vr.PROJOPT_SPCA, attrIndices, set_anchors=0, percent_data_used=100)
             if projections != None:
                 XAnchors, YAnchors, (attrNames, newIndices) = projections
-                table = graph.createProjectionAsExampleTable(newIndices, domain=domain, XAnchors=XAnchors, YAnchors=YAnchors)
+                table = graph.create_projection_as_example_table(newIndices, XAnchors=XAnchors, YAnchors=YAnchors)
             else:
                 print 'A null projection was found.'
         elif visualizationMethod == vr.SCATTERPLOT:
             XAnchors = YAnchors = None
-            table = graph.createProjectionAsExampleTable(attrIndices)
+            table = graph.create_projection_as_example_table(attrIndices)
         else:
-            XAnchors = graph.createXAnchors(len(attrIndices))
-            YAnchors = graph.createYAnchors(len(attrIndices))
-            validData = graph.getValidList(attrIndices)
+            XAnchors = graph.create_xanchors(len(attrIndices))
+            YAnchors = graph.create_yanchors(len(attrIndices))
+            validData = graph.get_valid_list(attrIndices)
             # more than min number of examples
             if np.sum(validData) >= 10:
                 classList = np.compress(validData, classListFull)
-                selectedData = np.compress(validData, np.take(graph.noJitteringScaledData, attrIndices, axis=0), axis=1)
+                selectedData = np.compress(validData, np.take(graph.no_jittering_scaled_data, attrIndices, axis=0), axis=1)
                 sum_i = graph._getSum_i(selectedData)
-                table = graph.createProjectionAsExampleTable(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors, domain=domain)
+                table = graph.create_projection_as_example_table(attrIndices, validData=validData, classList=classList, sum_i=sum_i, XAnchors=XAnchors, YAnchors=YAnchors)
 
         if not table: return None
 
         probabilities = []
         instance_predictions = []
         instance_classes = []
-        learner = kNNLearner(k=10, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
+        learner = kNNLearner(k=0, rankWeight=0, distanceConstructor=distance.Euclidean(normalize=0))
+
         for fold in range(self.folds):
             learnset = table.selectref(self.indices, fold, negate=1)
             testset = table.selectref(self.indices, fold, negate=0)
             classifier = learner(learnset)
-            tcn = 0
-            for i in range(len(table)):
-                if (self.indices[i] == fold):
-                    ex = data.Instance(testset[tcn])
-                    ex.setclass("?")
 
-                    cr = classifier(ex, classifier.GetBoth)
-                    if cr[0].isSpecial():
-                        raise "Classifier %s returned unknown value" % (classifier.name)
-                    probabilities.append(np.array(list(cr[1])))
-                    instance_predictions.append(cr[0])
-                    instance_classes.append(testset[tcn].get_class())
-                    tcn += 1
+            for test_ex in testset:
+                ex = data.Instance(test_ex)
+                ex.setclass("?")
+
+                cl, prob = classifier(ex, classifier.GetBoth)
+                if cl.isSpecial():
+                    raise ValueError("Classifier %s returned unknown value" % classifier.name)
+                probabilities.append(list(prob))
+                instance_predictions.append(cl.value)
+                instance_classes.append(test_ex.get_class().value)
 
         return Model(method,
                      learner(table),
-                     probabilities,
+                     np.array(probabilities),
+                     {val: i for i, val in enumerate(self.data_d.domain.class_var.values)},
                      attributes,
-                     np.array([c.value for c in instance_predictions]),
-                     np.array([c.value for c in instance_classes]),
+                     np.array(instance_predictions),
+                     np.array(instance_classes),
                      XAnchors=XAnchors,
                      YAnchors=YAnchors)
 
 
 
     def _print_time(self, time_start, iter, numiter):
-        if iter % 10000 == 0:
-            time_elapsed = time.time() - time_start
-            time_total = time_elapsed / iter * numiter * (numiter - 1) / 2
-            time_remainng = int(time_total - time_elapsed)
-            print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remainng / 60 / 60, ':', time_remainng / 60 % 60, ':', time_remainng % 60
+        time_elapsed = time.time() - time_start
+        time_total = time_elapsed / iter * (numiter * (numiter-1) / 2.)
+        time_remaining = int(time_total - time_elapsed)
+        print iter, '/', numiter * (numiter - 1) / 2, '| remaining:', time_remaining / 60 / 60, ':', time_remaining / 60 % 60, ':', time_remaining % 60
 
-    def build_model_matrix(self, models, dist=distance_class):
+    def build_model_matrix(self, models, dist=distance_manhattan):
         """Build a distance matrix of models given the distance measure."""
 
         dim = len(models)
-        print "%d models to matrix -- rank" % dim
-#        smx = np.zeros(shape=(dim, dim))
-#        counter = 0
-#        time_start = time.time()
-#        for i in range(dim):
-#            for j in range(i):
-#                smx[i, j] = dist(models[i], models[j])
-#                counter += 1
-#                self._print_time(time_start, counter, dim)
+        print "%d models to matrix -- %s" % (dim, dist.__name__)
+        smx = misc.SymMatrix(dim)
+        counter = 0
+        time_start = time.time()
+        for i in range(dim):
+            for j in range(i):
+                smx[i, j] = dist(models[i], models[j])
 
-        smx = misc.SymMatrix([[dist(models[i], models[j]) for j in range(i)] for i in range(dim)])
+            counter += i
+            if (i+1) % 1000 == 0:
+                self._print_time(time_start, counter, dim)
+
         return smx
 
     def build_model_data(self, models):
         table = get_models_table()
         table.extend([model.get_instance(table.domain) for model in models])
         return table
-
-    def save(self, fname, models=None, smx=None, table=None):
-        """Save model map to disk. Model similarity matrix and data table tuple 
-        is pickled and compressed as a bz2 archive.
-        
-        """
-
-        if models is None and (smx is None or table is None):
-            raise AttributeError("If models is none, smx and table must be given.")
-
-        if models is not None:
-            if type(models) != type([]):
-                raise AttributeError("Attribute models must be a list of models.")
-
-            if len(models) <= 0:
-                raise AttributeError("Attribute models is an empty list.")
-
-        if smx is None:
-            smx = self.build_model_matrix(models)
-
-        if table is None:
-            table = self.build_model_data(models)
-
-        pickle.dump((smx, table, self.data_d), bz2.BZ2File('%s.bz2' % fname, "w"), -1)
-
-    def load(self, fname):
-        """Load a model map. Read compressed tuple containing model similarity 
-        matrix and data table.
-        
-        """
-
-        smx, table, data = pickle.load(bz2.BZ2File('%s.bz2' % fname, "r"))
-        return smx, table, data
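
Putting the pieces together, a minimal end-to-end sketch under the obvious assumptions ("zoo" is an illustrative data set name; the learners come from the imports at the top of the module):

    build = BuildModelMap("zoo")
    learners = [kNNLearner(), TreeLearner()]
    models = [build.build_model(learner, build.data()) for learner in learners]

    smx = build.build_model_matrix(models, dist=distance_manhattan)
    table = build.build_model_data(models)
    save("zoo_models", smx, table, build.data())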

File _modelmaps/widgets/OWModelFile.py

         self.file_index = 0
 
         self.matrix = None
+        self.matrices = None
         self.model_data = None
         self.original_data = None
+        self.selected_matrix = None
 
         self.loadSettings()
 
         button.setIcon(self.style().standardIcon(QStyle.SP_DirOpenIcon))
         button.setSizePolicy(QSizePolicy.Maximum, QSizePolicy.Fixed)
 
+        self.propertiesBox = OWGUI.widgetBox(self.controlArea, "Properties", addSpace=True)
+        self.select_matrix_combo = OWGUI.comboBox(self.propertiesBox, self, "selected_matrix", label='Select matrix:', orientation='horizontal', callback=self.select_matrix)
+        self.select_matrix_combo.setEnabled(False)
 
 #        Moved to SymMatrixTransform widget
 #
         if self.files:
             self.loadFile()
 
+    def select_matrix(self):
+        self.matrix = self.matrices[str(self.select_matrix_combo.currentText())]
+        self.relabel()
+
     def browseFile(self):
         if self.files:
             lastPath = os.path.split(self.files[0])[0]
             fn = self.files[0]
 
         self.filecombo.clear()
+        self.select_matrix_combo.clear()
+        self.select_matrix_combo.setEnabled(False)
+
         for file in self.files:
             self.filecombo.addItem(os.path.split(file)[1])
         #self.filecombo.updateGeometry()
 
         self.matrix = None
+        self.matrices = None
         self.model_data = None
         self.original_data = None
         pb = OWGUI.ProgressBar(self, 100)
         self.error()
         try:
             matrix, self.model_data, self.original_data = pickle.load(bz2.BZ2File('%s' % fn, "r"))
-            if type(matrix) == type(misc.SymMatrix(1)):
+            if isinstance(matrix, dict):
+                for name in matrix.iterkeys():
+                    self.select_matrix_combo.addItem(name)
+
+                self.matrices = matrix
+                self.matrix = matrix[str(self.select_matrix_combo.currentText())]
+                self.select_matrix_combo.setEnabled(True)
+
+            elif isinstance(matrix, misc.SymMatrix):
                 self.matrix = matrix
+                self.select_matrix_combo.addItem("Single matrix found")
             else:
                 try:
                     self.matrix = misc.SymMatrix(matrix)
+                    self.select_matrix_combo.addItem("Single matrix found")
                 except TypeError:
                     self.matrix = misc.SymMatrix(matrix + matrix.T)
+                    self.select_matrix_combo.addItem("Single matrix found")
 
         except Exception, ex:
             self.error("Error while reading the file: '%s'" % str(ex))

File _modelmaps/widgets/OWModelMap.py

            not isinstance(self.items, data.Table):
             return
 
-        attributes = ["Cluster CA", "label", "CA", "attributes"]
+        attributes = ["Cluster CA", "label", "P", "attributes"]
 
 #        lbl  = "%s\n" % self.graph.items()[vertex.index]["label"].value
 #        lbl += "CA: %.4g\n" % self.graph.items()[vertex.index]["CA"].value
             clusters[val] = clusters.get(val, []) + [key]
 
         items = self._network.items()
-        self._representatives = {max(val, key=lambda x, items=items: items[x]["CA"].value): val for val in clusters.itervalues()}
+        self._representatives = {max(val, key=lambda x, items=items: items[x]["P"].value): val for val in clusters.itervalues()}
         #for key, val in clusters.items():
         #    representatives.append(max(val, key=lambda x, items=items: items[x]["CA"].value))
         # find neighbors for all representatives