Commits

Marko Toplak  committed 2b90e7c

Fixed a bug in obiGene (umatch did not work in Match objects). Gene set signatures method now accept examples from different domains on the input.

  • Participants
  • Parent commits 9bc83a6

Comments (0)

Files changed (2)

         return self.matcho.explain(gene)
 
 class Match(object):
-    pass
+
+    def umatch(self, gene):
+        """Returns an unique (only one matching target) target or None"""
+        mat = self.match(gene)
+        return mat[0] if len(mat) == 1 else None
  
 class MatchAliases(Match):
 

File obiGeneSetSig.py

 import obiGene
 import numpy
 from collections import defaultdict
-import statc
 import stats
+import obiGsea
 
-def selectGenesetsData(data, matcher, geneSets, minSize=3, maxSize=1000, minPart=0.1, classValues=None):
-    """
-    Returns gene sets and data which falling under upper criteria.
-    """
-    gso = obiGsea.GSEA(data, matcher=matcher, classValues=classValues, atLeast=0)
-    gso.addGenesets(geneSets)
-    okgenesets = gso.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart).keys()
-    gsetsnum = gso.to_gsetsnum(okgenesets)
-    return gso.data, okgenesets, gsetsnum
+def setSig_example_geneset(ex, data):
+    """ Gets learning data and example with the same domain, both
+    containing only genes from the gene set. """
+
+    distances = [ [], [] ]    
+
+    def pearsonr(v1, v2):
+        return numpy.corrcoef([v1, v2])[0,1]
+
+    def pearson(ex1, ex2):
+        #leaves undefined elements out
+
+        attrs = range(len(ex1.domain.attributes))
+        vals1 = [ ex1[i].value for i in attrs ]
+        vals2 = [ ex2[i].value for i in attrs ]
+
+        common = [ True if v1 != "?" and v2 != "?" else False \
+            for v1,v2 in zip(vals1,vals2) ]
+        vals1 = [ v for v,c in zip(vals1, common) if c ]
+        vals2 = [ v for v,c in zip(vals2, common) if c ]
+
+        return pearsonr(vals1, vals2)
+
+    def ttest(ex1, ex2):
+        try:
+            return stats.lttest_ind(ex1, ex2)[0]
+        except:
+            return 0.0
+    
+    #maps class value to its index
+    classValueMap = dict( [ (val,i) for i,val in enumerate(data.domain.class_var.values) ])
+ 
+    #create distances to all learning data - save or other class
+    for c in data:
+        distances[classValueMap[c[-1].value]].append(pearson(c, ex))
+
+    return ttest(distances[0], distances[1])
+
+def mat_ni(data, matcher):
+    nm = matcher([at.name for at in data.domain.attributes])
+    name_ind = dict((n.name,i) for i,n in enumerate(data.domain.attributes))
+    return nm, name_ind
+
+def select_genesets(nm, gene_sets, min_size=3, max_size=1000, min_part=0.1):
+    """ Returns a list of gene sets that have sizes in limits """
+
+    def ok_sizes(gs):
+        """compares sizes of genesets to limitations"""
+        transl = filter(lambda x: x != None, [ nm.umatch(gene) for gene in gs.genes ])
+        if len(transl) >= min_size \
+            and len(transl) <= max_size \
+            and float(len(transl))/len(gs.genes) >= min_part:
+            return True
+        return False
+
+    return filter(ok_sizes, gene_sets) 
 
 class SetSig(object):
 
         self.class_values = class_values
 
     def __call__(self, data, weight_id=None):
-        data, oknames, gsetsnum = obiAssess.selectGenesetsData(data, 
-            self.matcher, self.gene_sets,
-            minSize=self.min_size, maxSize=self.max_size, 
-            minPart=self.min_part, classValues=self.class_values)
 
-        def setSig_example_geneset(ex, data):
-            """ ex contains only selected genes """
-
-            distances = [ [], [] ]    
-
-            def pearsonr(v1, v2):
-                try:
-                    return statc.pearsonr(v1, v2)[0]
-                except:
-                    return numpy.corrcoef([v1, v2])[0,1]
-
-            def pearson(ex1, ex2):
-                attrs = range(len(ex1.domain.attributes))
-                vals1 = [ ex1[i].value for i in attrs ]
-                vals2 = [ ex2[i].value for i in attrs ]
-                return pearsonr(vals1, vals2)
-
-            def ttest(ex1, ex2):
-                try:
-                    return stats.lttest_ind(ex1, ex2)[0]
-                except:
-                    return 0.0
-            
-            #maps class value to its index
-            classValueMap = dict( [ (val,i) for i,val in enumerate(data.domain.classVar.values) ])
-         
-            #create distances to all learning data - save or other class
-            for c in data:
-                distances[classValueMap[c[-1].value]].append(pearson(c, ex))
-
-            return ttest(distances[0], distances[1])
+        data = obiGsea.takeClasses(data, classValues=self.class_values)
+        nm,_ =  mat_ni(data, matcher)
+        gene_sets = select_genesets(nm, self.gene_sets, self.min_size, self.max_size, self.min_part)
 
         attributes = []
 
-        for name, gs in gsetsnum.items(): #for each geneset
-            #for each gene set: take the attribute subset and work on the attribute subset only
-            #only select the subset of genes from the learning data
-            at = Orange.feature.Continuous(name=name.id)
+        for gs in gene_sets:
+            at = Orange.feature.Continuous(name=gs.id)
 
-            def t(ex, w, gs=gs, ldata=data):
-                domain = Orange.data.Domain([ldata.domain.attributes[ai] for ai in gs], ldata.domain.classVar)
-                datao = Orange.data.Table(domain, ldata)
-                example = Orange.data.Instance(domain, ex) #domains need to be the same
-                return setSig_example_geneset(example, datao)
+            def t(ex, w, gs=gs, data=data): #copy od the data
+                geneset = list(gs.genes)
+
+                nm, name_ind = mat_ni(data, matcher)
+                nm2, name_ind2 = mat_ni(ex, matcher)
+
+                genes = [ nm.umatch(gene) for gene in geneset ]
+                genes2 = [ nm2.umatch(gene) for gene in geneset ]
+
+                genes, genes2 = zip(*[ (g,g2) for g,g2 in zip(genes, genes2) if g != None])
+
+                domain = Orange.data.Domain([data.domain.attributes[name_ind[gene]] for gene in genes], data.domain.class_var)
+                datao = Orange.data.Table(domain, data)
+
+                def vou(ex, gn, indices):
+                    """ returns the value or unknown for the given gene name"""
+                    if gn not in indices:
+                        return "?"
+                    else:
+                        return ex[indices[gn]].value
+                
+                #convert the example to the same domain
+                exvalues = [ vou(ex, gn, name_ind2) for gn in genes2 ] + [ "?" ]
+                example = Orange.data.Instance(domain, exvalues)
+
+                return setSig_example_geneset(example, datao) #only this one is setsig specific
          
             at.get_value_from = t
             attributes.append(at)
     matcher = obiGene.matcher([])
 
     choosen_cv = ["Iris-setosa", "Iris-versicolor"]
+
     def to_old_dic(d, data):
         ar = defaultdict(list)
         for ex1 in data:
         ol =  sorted(ar.items())
         print '\n'.join([ a + ": " +str(b) for a,b in ol])
 
-    ass = SetSig(ldata, matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0)
-    print ass.domain
+    ass = SetSig(matcher=matcher, gene_sets=gsets, class_values=choosen_cv, min_part=0.0)
+    ass = ass(data)
     ar = to_old_dic(ass.domain, data[:5])
     pp2(ar)