Commits

Flashpoint committed 736a0fc Merge

Merged biolab/orange-bioinformatics into default

  • Participants
  • Parent commits a94aec2, 023037d

Comments (0)

Files changed (2)

_bioinformatics/obiGeneSetSig.py

         return self._cache[data.domain]
 
     def _match_instance(self, instance, geneset, takegenes=None):
+        """
+        Return
+            - a gene matcher with the instance as a target
+            - { name: attribute indices } of an instance
+            - genes names on the data set that were matched by the gene set
+
+        If takegenes is a list of indices, use only genes from
+        the gene set with specified indices.
+        """
         nm, name_ind = self._mat_ni(instance)
         genes = [ nm.umatch(gene) for gene in geneset ]
         if takegenes:
             # The domain has the transformer that is build on all samples,
             # while the transformed data table uses cross-validation
             # internally
-            folds = 5
-            cvi = Orange.data.sample.SubsetIndicesCV(data, folds)
+            if self.cv == True:
+                cvi = Orange.data.sample.SubsetIndicesCV(data, 5)
+            elif self.cv != False:
+                cvi = self.cv(data)
             data_cv = [ [] for _ in range(len(data)) ]
-            for f in range(folds):
+            for f in set(cvi):
                 learn = data.select(cvi, f, negate=True)
                 test = data.select(cvi, f)
                 lf = self.build_features(learn, gene_sets)
             #print p1, p2, "exception"
             return 0
 
+def estimate_gaussian_per_class(data, i, a=None, b=None, common_if_extreme=False):
+    cv = data.domain.class_var
+
+    if a == None: a = cv.values[0]
+    if b == None: b = cv.values[1]
+
+    def avWCVal(value):
+        return [ex[i].value for ex in data if ex[-1].value == value and not ex[i].isSpecial() ]
+
+    list1 = avWCVal(a)
+    list2 = avWCVal(b)
+
+    mi1 = mi2 = st1 = st2 = None
+
+    try:
+        mi1 = statc.mean(list1)
+        st1 = statc.std(list1)
+    except:
+        pass
+
+    try:
+        mi2 = statc.mean(list2)
+        st2 = statc.std(list2)
+    except:
+        pass
+
+    def extreme():
+        return st1 == 0 or st2 == 0
+    
+    if common_if_extreme and extreme():
+        print "extreme", st1, st2,
+        st1 = st2 = statc.std(list1 + list2)
+        print "new", st1, st2
+
+    return mi1, st1, mi2, st2
+
 class AT_edelmanParametricLearner(object):
     """
     Returns attribute transfromer for Edelman parametric measure for a
         self.b = b
 
     def __call__(self, i, data):
-        cv = data.domain.classVar
-        #print data.domain
+        cv = data.domain.class_var
 
         if self.a == None: self.a = cv.values[0]
         if self.b == None: self.b = cv.values[1]
 
-        def avWCVal(value):
-            return [ex[i].value for ex in data if ex[-1].value == value and not ex[i].isSpecial() ]
-
-        list1 = avWCVal(self.a)
-        list2 = avWCVal(self.b)
-
-        mi1 = mi2 = st1 = st2 = None
-
-        try:
-            mi1 = statc.mean(list1)
-            st1 = statc.std(list1)
-        except:
-            pass
-    
-        try:
-            mi2 = statc.mean(list2)
-            st2 = statc.std(list2)
-        except:
-            pass
+        mi1, st1, mi2, st2 = estimate_gaussian_per_class(data, i, a=self.a, b=self.b)
 
         return AT_edelmanParametric(mi1=mi1, mi2=mi2, st1=st1, st2=st2)
 
 
         return attributes
    
-def setSig_example_geneset(ex, data, no_unknowns):
+def setSig_example_geneset(ex, data, no_unknowns, check_same=False):
     """ Gets learning data and example with the same domain, both
     containing only genes from the gene set. """
 
         vals1 = ex1.native(0)[:-1]
         vals2 = ex2.native(0)[:-1]
 
+        if check_same and vals1 == vals2:
+            return 10 #they are the same
+
         #leaves undefined elements out
         if not no_unknowns:
             common = [ True if v1 != "?" and v2 != "?" else False \
  
     #create distances to all learning data - save or other class
     for c in data:
-        distances[classValueMap[c[-1].value]].append(pearson(c, ex))
+        p = pearson(c, ex)
+        if p != 10:
+             distances[classValueMap[c[-1].value]].append(pearson(c, ex))
 
     return ttest(distances[0], distances[1])
 
 def mat_ni(data, matcher):
+    """ Return (in a tuple):
+        - a gene matcher that matches to the attribute names of data
+        - a dictionary attribute names -> indices in the data set.
+    """
     nm = matcher([at.name for at in data.domain.attributes])
     name_ind = dict((n.name,i) for i,n in enumerate(data.domain.attributes))
     return nm, name_ind
 
     def __init__(self, **kwargs):
         self.no_unknowns = kwargs.pop("no_unknowns", False)
+        self.check_same = kwargs.pop("check_same", False)
         super(SetSig, self).__init__(**kwargs)
 
     def build_feature(self, data, gs):
             exvalues = [ vou(ex, gn, name_ind2) for gn in genes2 ] + [ "?" ]
             example = Orange.data.Instance(domain, exvalues)
 
-            return setSig_example_geneset(example, datao, self.no_unknowns) #only this one is setsig specific
+            return setSig_example_geneset(example, datao, self.no_unknowns, check_same=self.check_same) #only this one is setsig specific
      
         at.get_value_from = t
         return at
             ex = Orange.data.Instance(domain, exvalues)
 
             return self._use_par(ex, constructt)
-            
+        
         at.get_value_from = t
+        at.dbg = constructt #for debugging
+        
         return at
 
 class PLS(ParametrizedTransformation):
     bg = 1
     for a in range(2, len(sortedinds)+1):
         tg = S(sortedinds[:a])
-        if tg > g:
+        if tg > g: #improvement
             g = tg
             bg = a
         else:
             break
         
-    return sortedinds[:a]
+    return sortedinds[:bg] #FIXED - one too many was taken
 
 class CORGs(ParametrizedTransformation):
     """
         at.get_value_from = t
         return at
 
+def compute_llr(data, inds, cache):
+    """
+    Compute CORG for this geneset specified with gene inds
+    in the example table. Output is the list of gene inds
+    in CORG.
+    """
+    def gaussianc(data, at, cache=None):
+        """ Cached attribute  tscore calculation """
+        if cache != None and at in cache: return cache[at]
+        ma = estimate_gaussian_per_class(data, at, common_if_extreme=True)
+        if cache != None: cache[at] = ma
+        return ma
+
+    gf = [ gaussianc(data, at, cache) for at in inds ]
+    return gf
+
+""" To avoid scipy overhead """
+from math import pi
+_norm_pdf_C = math.sqrt(2*pi)
+_norm_pdf_logC = math.log(_norm_pdf_C)
+def _norm_logpdf(x, mi, std):
+    return -(x-mi)**2 / (2.0*std**2) - _norm_pdf_logC - math.log(std)
+
+def _llrlogratio(v, mi1, std1, mi2, std2):
+    if mi1 == None or std1 == None or mi2 == None or std2 == None:
+        return 0. #problem with estimation
+    #lpdf1 = scipy.stats.norm.logpdf(v, mi1, std1)
+    #lpdf2 = scipy.stats.norm.logpdf(v, mi2, std2)
+    lpdf1 = _norm_logpdf(v, mi1, std1) #avoids scipy's overhead
+    lpdf2 = _norm_logpdf(v, mi2, std2)
+    r = lpdf1 - lpdf2
+    return r
+
+class LLR(ParametrizedTransformation):
+    """ 
+    From Su et al: Accurate and Reliable Cancer Classification Base
+    on Probalistic Inference of Pathway Activity. Plos One, 2009.
+    """
+
+    def __init__(self, **kwargs):
+        self.normalize = kwargs.pop("normalize", False) #normalize final results
+        super(LLR, self).__init__(**kwargs)
+
+    def __call__(self, *args, **kwargs):
+        self._gauss_cache = {} #caching of gaussian estimates
+        self._normalizec = {}
+        return super(LLR, self).__call__(*args, **kwargs)
+
+    def build_feature(self, data, gs):
+
+        at = Orange.feature.Continuous(name=str(gs))
+        geneset = list(gs.genes)
+
+        nm, name_ind, genes, takegenes, to_geneset = self._match_data(data, geneset, odic=True)
+
+        gsi = [ name_ind[g] for g in genes ]
+        gausse = compute_llr(data, gsi, self._gauss_cache)
+        genes_gs = [ to_geneset[g] for g in genes ]
+
+        if self.normalize: # per (3) in the paper
+            #compute log ratios for all samples and genes from this gene set
+            for i, gene_gs, g in zip(gsi, genes_gs, gausse):
+                if gene_gs not in self._normalizec: #skip if computed already
+                    r = [ _llrlogratio(ex[i].value, *g) for ex in data ]
+                    self._normalizec[gene_gs] = (statc.mean(r), statc.std(r))
+
+        def t(ex, w, genes_gs=genes_gs, gausse=gausse, normalizec=self._normalizec):
+            nm2, name_ind2, genes2 = self._match_instance(ex, genes_gs, None)
+            gsvalues = [ vou(ex, gn, name_ind2) for gn in genes2 ]
+
+            vals = [ _llrlogratio(v, *g) if v != "?" else 0.0 for v,g in zip(gsvalues, gausse) ]
+
+            if len(normalizec): #normalize according to (3)
+                vals2 = []
+                for v,g in zip(vals, genes_gs):
+                    m,s = self._normalizec[g]
+                    vals2.append((v-m)/s)
+                vals = vals2
+            
+            return sum(vals)
+
+     
+        at.get_value_from = t
+        return at
+
 if __name__ == "__main__":
 
     data = Orange.data.Table("iris")

server_update/updatemiRNA.py

         sendMail('Uncorrect format of miRBase data-file.')        
         return False
 
+IDpat = re.compile('ID\s*(\S*)\s*standard;')
+ACpat = re.compile('AC\s*(\S*);')
+RXpat = re.compile('RX\s*PUBMED;\s(\d*).')
+FT1pat = re.compile('FT\s*miRNA\s*(\d{1,}\.\.\d{1,})')
+FT2pat = re.compile('FT\s*/accession="(MIMAT[0-9]*)"')
+FT3pat = re.compile('FT\s*/product="(\S*)"')
+SQpat = re.compile('SQ\s*(.*other;)')
+seqpat = re.compile('\s*([a-z\s]*)\s*\d*')
+
+
     
 def get_intoFiles(path, data_webPage):
     
             for r in rows:
                 
                 if r[0:2] == 'ID':
-                    preID = str(re.findall('ID\s*(\S*)\s*standard;',r)[0])
+                    preID = str(IDpat.findall(r)[0])
                     print preID
                         
                 elif r[0:2] == 'AC':
-                    preACC = str(re.findall('AC\s*(\S*);',r)[0])
+                    preACC = str(ACpat.findall(r)[0])
                     web_addr = 'http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s' % preACC
                         
-                elif r[0:2] == 'RX' and not(re.findall('RX\s*PUBMED;\s(\d*).',r)==[]):
-                    pubIDs.append(str(re.findall('RX\s*PUBMED;\s(\d*).',r)[0]))
-                            
-                elif r[0:2]=='FT' and not(re.findall('FT\s*miRNA\s*(\d{1,}\.\.\d{1,})',r)==[]):
-                    loc_mat = str(re.findall('FT\s*miRNA\s*(\d{1,}\.\.\d{1,})',r)[0])
+                elif r[0:2] == 'RX' and not(RXpat.findall(r)==[]):
+                    pubIDs.append(str(RXpat.findall(r)[0]))
+
+                elif r[0:2]=='FT' and not(FT1pat.findall(r)==[]):
+                    loc_mat = str(FT1pat.findall(r)[0])
                         
                     if not(loc_mat==[]):
                          my_locs.append(loc_mat)
                 
-                elif r[0:2]=='FT' and not(re.findall('FT\s*/accession="(MIMAT[0-9]*)"', r)==[]):
-                     mat_acc = str(re.findall('FT\s*/accession="(MIMAT[0-9]*)"', r)[0])
+                elif r[0:2]=='FT' and not(FT2pat.findall(r)==[]):
+                     mat_acc = str(FT2pat.findall(r)[0])
                         
                      if matACCs == '':
                          matACCs = mat_acc
                      if not(mat_acc == []):
                          my_accs.append(mat_acc)    
                                 
-                elif r[0:2]=='FT' and not(re.findall('FT\s*/product="(\S*)"', r)==[]):
-                     mat_id = str(re.findall('FT\s*/product="(\S*)"', r)[0])
+                elif r[0:2]=='FT' and not(FT3pat.findall(r)==[]):
+                     mat_id = str(FT3pat.findall(r)[0])
                         
                      if matIDs == '':
                          matIDs = mat_id
                                           
                 elif r[0:2]=='SQ':
             
-                     preSQ_INFO = str(re.findall('SQ\s*(.*other;)', r)[0])
+                     preSQ_INFO = str(SQpat.findall(r)[0])
                      seq = 'on'
             
                 elif r[0:2]=='  ' and seq == 'on':
-                     preSQ.append(str(re.findall('\s*([a-z\s]*)\s*\d*',r)[0]).replace(' ',''))
+                     preSQ.append(str(seqpat.findall(r)[0]).replace(' ',''))
                      
             ### cluster search
             clusters = ''
                 label = re.findall('/(\S{3,4}_\S{3}miRNA?)\.txt',filename)[0]
                 
                 if type_file == 'mat':
-                    serverFiles.upload("miRNA", label, filename, title="miRNA: %s mature form" % org, tags=["tag1", "tag2"])
+                    serverFiles.upload("miRNA", label, filename, title="miRNA: %s mature form" % org, tags=["miRNA"])
                     serverFiles.unprotect("miRNA", label)
                     print '%s mat uploaded' % org
                     
                         fastprint(miRNA_path,'a',file_line)                 
                     
                 elif type_file == 'pre':
-                    serverFiles.upload("miRNA", label, filename, title="miRNA: %s pre-form" % org, tags=["tag1", "tag2"])
+                    serverFiles.upload("miRNA", label, filename, title="miRNA: %s pre-form" % org, tags=["miRNA"])
                     serverFiles.unprotect("miRNA", label)
                     print '%s pre uploaded' % org
                     
                 else:
                     print 'Check the label.'
     
-    serverFiles.upload("miRNA", "miRNA.txt", miRNA_path)
+    serverFiles.upload("miRNA", "miRNA.txt", miRNA_path, title="miRNA: miRNA library", tags=["miRNA"] )
     serverFiles.unprotect("miRNA", "miRNA.txt")
     print '\nmiRNA.txt uploaded'
     
-    serverFiles.upload("miRNA", "premiRNA.txt", premiRNA_path)
+    serverFiles.upload("miRNA", "premiRNA.txt", premiRNA_path, title="miRNA: pre-form library", tags=["miRNA"])
     serverFiles.unprotect("miRNA", "premiRNA.txt")
     print 'premiRNA.txt uploaded\n'
 else: