Commits

Marko Toplak committed 643d378

obiGeneSetSig: speedup of ASSESS.

Comments (0)

Files changed (2)

_bioinformatics/obiGeneSetSig.py

         self.rankingf = rankingf
         if self.rankingf == None:
             self.rankingf = AT_edelmanParametricLearner()
+        self.example_buffer = {}
+        self.attransv = 0
         super(Assess, self).__init__(**kwargs)
 
+    def _ordered_and_lcor(self, ex, nm, name_ind, attrans, attransv):
+        """ Buffered! It should be computed only once per example. """ 
+        #name_ind and nm are always co-created, so I need to have only one as a key
+        key = (ex, nm, attransv)
+        if key not in self.example_buffer:
+            ex_atts = [ at.name for at in ex.domain.attributes ]
+            new_atts = [ name_ind[nm.umatch(an)] if nm.umatch(an) != None else None
+                for an in ex_atts ]
+
+            #new_atts: indices of genes in original data for that sample 
+            #POSSIBLE REVERSE IMPLEMENTATION (slightly different
+            #for data from different chips):
+            #save pairs together and sort (or equiv. dictionary transformation)
+
+            indexes = filter(lambda x: x[0] != None, zip(new_atts, range(len(ex_atts))))
+
+            lcor = [ attrans[index_in_data](ex[index_in_ex].value) 
+                for index_in_data, index_in_ex in indexes if
+                ex[index_in_ex].value != '?' ]
+            #indexes in original lcor, sorted from higher to lower values
+            ordered = obiGsea.orderedPointersCorr(lcor)
+            rev2 = numpy.argsort(ordered)
+            self.example_buffer[key] = lcor,ordered,rev2
+        return self.example_buffer[key]
+
     def build_features(self, data, gene_sets):
 
         attributes = []
 
         #attrans: { i_orig: ranking_function }
         attrans = [ self.rankingf(iat, data) for iat, at in enumerate(data.domain.attributes) ]
+        attransv = self.attransv
+        self.attransv += 1
 
         nm_all, _ =  self._mat_ni(data)
 
 
             geneset = list(gs.genes)
             nm, name_ind, genes, takegenes, to_geneset = self._match_data(data, geneset, odic=True)
+            takegenes = [ geneset[i] for i in takegenes ]
             genes = set(genes)
-            
-            def t(ex, w, geneset=geneset, takegenes=takegenes, nm=nm, attrans=attrans):
 
-                nm2, name_ind2, genes2 = self._match_instance(ex, geneset, takegenes)
+            def t(ex, w, takegenes=takegenes, nm=nm, attrans=attrans, attransv=attransv):
 
-                ex_atts = [ at.name for at in ex.domain.attributes ]
-                new_atts = [ name_ind[nm.umatch(an)] if nm.umatch(an) != None else None
-                    for an in ex_atts ]
-                #new_atts: indices of genes in original data for that sample 
-                #POSSIBLE REVERSE IMPLEMENTATION (slightly different
-                #for data from different chips):
-                #save pairs together and sort (or equiv. dictionary transformation)
+                nm2, name_ind2, genes2 = self._match_instance(ex, takegenes)
+                lcor, ordered, rev2 = self._ordered_and_lcor(ex, nm, name_ind, attrans, attransv)
 
-                indexes = filter(lambda x: x[0] != None, zip(new_atts, range(len(ex_atts))))
-
-                lcor = [ attrans[index_in_data](ex[index_in_ex].value) 
-                    for index_in_data, index_in_ex in indexes if
-                    ex[index_in_ex].value != '?' ]
-                #indexes in original lcor, sorted from higher to lower values
-                ordered = obiGsea.orderedPointersCorr(lcor) 
                 #subset = list of indices, lcor = correlations, ordered = order
                 subset = [ name_ind2[g] for g in genes2 ]
-                return obiGsea.enrichmentScoreRanked(subset, lcor, ordered)[0] 
+                return obiGsea.enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0] 
 
             at.get_value_from = t
             attributes.append(at)

_bioinformatics/obiGsea.py

     their lcor[i] value. Higher correlations first.
     """
     ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
-    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
+    ordered.sort(key=lambda x: -x[1]) #sort by correlation, descending
     ordered = nth(ordered, 0) #contains positions in the original list
     return ordered
 
     #add if gene is not in the subset
     notInA = -(1. / (len(lcor)-len(subset)))
     #base for addition if gene is in the subset
-    cors = [ abs(lcor[i])**p for i in subset ]
+
+    cors = [ abs(lcor[i])**p for i in subset ] #below in numpy
     sumcors = sum(cors)
 
     #this should not happen
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.