Anonymous avatar Anonymous committed 549b9ae

Automatically detect the most appropriate annotation file and the attribute that holds the gene ID values

Comments (0)

Files changed (1)

OWGOTermFinder.py

 <priority>100</priority>
 """
 
-import orange, math
+import orange, math, glob
 import GOlib ## function needed to handle the GO and annotation
 import OWGUI
 from qt import *
 from qttable import *
 from qwt import *
 
+DEBUG = 0
+
 class OWGOTermFinder(OWWidget):	
     settingsList = ["AnnotationFileName", "RecentAnnotations", "ReferenceType", "RecentGOaspects",
                     "FilterNumEnabled", "FilterNumValue", "FilterPvalEnabled", "FilterPvalue", "FilterDepthEnabled", "FilterDepthValue",
         self.AnnotationFileName = self.GOaspectFileName = None # these are names of files
         self.RecentAnnotations = []
         self.BAnnotationIndx = 0
+        self.genesInAnnotationFile = {}
         # reference
         self.ReferenceType = 0 ## get the reference from the annotation
         # GO
         self.loadSettings()
         self.data = None
         # check if files exist and remove those that don't
+        # check that all files in directories "Annotation" and "GO" are already included
         self.RecentAnnotations = filter(os.path.exists, self.RecentAnnotations)
         self.RecentGOaspects = filter(os.path.exists, self.RecentGOaspects)
+        widgetDir = os.path.dirname(os.path.abspath(__file__)) + "/"
+        ## add all annotations in "./Annotation" directory
+        annotList = glob.glob(widgetDir + 'Annotation/*.annotation')
+        for f in annotList:
+            if f not in self.RecentAnnotations:
+                self.RecentAnnotations.append( f)
+        genesInAnnotationFile = {}
+
+        ## add all GOs in "./GO" directory
+        GOlist = glob.glob(widgetDir + 'GO/*.go')
+        for f in GOlist:
+            if f not in self.RecentGOaspects:
+                self.RecentGOaspects.append( f)
 
         # tmp structures - loaded by user
         self.annotation = None
         self.GO = None
 
         # received by signals
-        self.geneNameAttr = 'DDB' # name of attribute from where to take the gene ID or name
+        self.candidateGeneIDsFromSignal = [] ## list of discrete attributes present in clusterSet data signal
+        self.BgeneIDattrIndx = -1 ## index of attribute in candidateGeneIDsFromSignal that was selected to represent the gene IDs
+        self.geneIDattr = None ## self.geneIDattr is set accordingly
         # should read from 'GeneName' column in input signal "Examples"
         self.clusterGenes = [] ## ['YPD1', 'WHI4', 'SHS1', 'GCS1', 'HO', 'YDL228C', 'SSB1', 'PTP1', 'BRE4', 'OST4', 'YDL233W', 'GYP7']
         self.clusterData = None
         # should read from 'GeneName' column in input signal "Examples Reference"
-        self.referenceGenes = []
+        self.referenceGenes = None
+        self.referenceData = None
 
         # calculated from tmp structures and received signals
         # updated by filters
         self.setFilelist(self.GOaspectCombo, self.RecentGOaspects)
         self.GOaspectBrowse = OWGUI.button(box, self, 'Browse', callback=self.browseGOaspect)
         self.GOaspectBrowse.setMaximumSize(50, 30)
+        # gene name attribute
+        box = QHButtonGroup("Gene ID attribute", self.inputTab)
+        box.setMaximumSize(250, 50)
+        self.geneIDAttrCombo = OWGUI.comboBox(box, self, 'BgeneIDattrIndx', items=[], callback=self.geneIDchanged)
+        self.geneIDAttrCombo.setMaximumSize(160, 20)
+        self.setGeneIDAttributeList()
         self.tabs.insertTab(self.inputTab, "Input")
 
         # FILTER TAB
 
         self.resize(1000, 800)
         self.layout.activate() # this is needed to scale the widget correctly
-        self.loadAnnotation()
-        self.loadGOaspect()
+
+    def geneIDchanged(self):
+        """Apply the gene ID attribute selected by self.BgeneIDattrIndx.
+
+        Sets self.geneIDattr to the matching entry of
+        self.candidateGeneIDsFromSignal (or to None when the index does not
+        address a candidate), then refreshes the cluster and reference gene
+        lists and calls findTermsBuildDAG().
+        """
+        numCandidates = len(self.candidateGeneIDsFromSignal)
+        ## BgeneIDattrIndx is -1 while nothing is selected; a negative index must
+        ## neither pick the last candidate nor raise IndexError on an empty list
+        if 0 <= self.BgeneIDattrIndx < numCandidates:
+            self.geneIDAttrCombo.setCurrentItem(self.BgeneIDattrIndx)
+            self.geneIDattr = self.candidateGeneIDsFromSignal[self.BgeneIDattrIndx]
+        else:
+            self.geneIDattr = None
+        if DEBUG: print "changing geneID attribute to: " + str(self.geneIDattr)
+        self.clusterDatasetChanged()
+        self.referenceDatasetChanged()
+        self.findTermsBuildDAG()
+        ## the combo is only usable when there is something to choose from
+        self.geneIDAttrCombo.setDisabled(numCandidates == 0)
+
+    def setGeneIDAttributeList(self):
+        """Repopulate the gene ID combo box from self.candidateGeneIDsFromSignal."""
+        combo = self.geneIDAttrCombo
+        candidates = self.candidateGeneIDsFromSignal
+        combo.clear()
+        for attr in candidates:
+            combo.insertItem(str(attr.name))
+        ## no candidates -> disable the combo
+        combo.setDisabled(len(candidates) == 0)
+
+    def updateGeneID2annotationfile(self):
+        """Rebuild self.geneID2annotationfile from self.RecentAnnotations.
+
+        The result maps each gene ID to the list of indexes (positions in
+        self.RecentAnnotations) of the annotation files that contain it.
+        The gene IDs of every file are cached in self.genesInAnnotationFile,
+        so a file is unpickled only the first time it is encountered.
+        """
+        self.geneID2annotationfile = {}
+        for cn, f in enumerate(self.RecentAnnotations):
+            if f not in self.genesInAnnotationFile:
+                ## NOTE: unpickling can execute arbitrary code - only trusted annotation files should be listed
+                annotationFile = open(f, 'r')
+                try:
+                    loadedAnnotation = cPickle.load(annotationFile)
+                finally:
+                    annotationFile.close() ## do not leak one open handle per annotation file
+                self.genesInAnnotationFile[f] = loadedAnnotation['gene2GOID'].keys()
+            for geneID in self.genesInAnnotationFile[f]:
+                tmpl = self.geneID2annotationfile.setdefault(geneID, [])
+                if cn not in tmpl:
+                    tmpl.append(cn)
+
+    ## this is called only when new data token is received
+    def findMostAppropriateGeneIDandAnnotation(self):
+        """Pick the gene ID attribute and annotation file that fit self.clusterData.
+
+        Without cluster data the candidate list, the selected index and
+        self.geneIDattr are reset and the method returns. Otherwise every
+        candidate attribute is scored by how many of its values occur as gene
+        IDs in the known annotation files; the chosen attribute is stored in
+        self.geneIDattr / self.BgeneIDattrIndx, an annotation file is loaded if
+        needed, and geneIDchanged() is called to recompute everything.
+        """
+        if self.clusterData == None:
+            self.candidateGeneIDsFromSignal = []
+            self.BgeneIDattrIndx = -1
+            self.geneIDattr = None
+            self.setGeneIDAttributeList()
+            return
+
+        ## all discrete and string type attributes are good candidates
+        self.candidateGeneIDsFromSignal = [a for a in self.clusterData.domain.attributes + self.clusterData.domain.getmetas().values() if a.varType == orange.VarTypes.Discrete or a.varType == orange.VarTypes.Other]
+        self.setGeneIDAttributeList()
+        self.geneIDAttrCombo.setDisabled(1)
+
+        ## rebuild self.geneID2annotationfile from the current RecentAnnotations list
+        ## (files not seen before are unpickled, the others come from self.genesInAnnotationFile)
+        self.updateGeneID2annotationfile() 
+
+        ## for each attribute look how many genesID are there, that are also present in geneID2annotationfile
+        ## if current self.geneIDattr has count 0
+        ## then select attribute with highest count
+        ## else keep self.geneIDattr
+
+        ## when best attribute selected, check if the loaded annotation is ok
+        ## otherwise suggest the most appropriate annotation
+        bestAttr = '' ## best candidate attribute found so far; stays '' when no attribute matched any gene ID
+        bestCn = 0
+        bestAnnotation = 0
+        lst = self.candidateGeneIDsFromSignal
+        ## the currently selected attribute is scored first; it also stays in lst, so it is scored twice
+        if self.geneIDattr <> None and self.geneIDattr in self.candidateGeneIDsFromSignal: lst = [self.geneIDattr] + lst
+
+        for attr in lst:
+            vals = [ex[attr] for ex in self.clusterData]
+
+            ## calculate the frequency of each annotation file to which this geneID belongs to
+            annotationFrequency = {}
+            cn = 0 ## number of values of attr that are known gene IDs
+            for v in vals:
+                v = str(v)
+                i = self.geneID2annotationfile.get(v, -1) ## -1, not present
+                if i <> -1:
+                    for ai in i:
+                        af = annotationFrequency.get(ai, 0)
+                        annotationFrequency[ai] = af + 1
+                    cn += 1
+            ## attr becomes the best one if it matches strictly more values than the best so far,
+            ## or if it is the currently selected attribute and matches at least one value
+            if cn > bestCn or (cn > 0 and attr == self.geneIDattr):
+                bestAttr = attr
+                bestCn = cn
+                ## (frequency, index) pairs, so that sorting orders by frequency
+                afs = [(f, anindex) for (anindex, f) in annotationFrequency.items()]
+                if len(afs) > 0:
+                    afs.sort()
+                    afs.reverse() ## most frequent first
+                    bestAnnotation = afs[0][1]
+                else:
+                    bestAnnotation = 0 ## keep current
+        if DEBUG: print "best attribute: " + str(bestAttr) + " with " + str(bestCn) + " gene IDs from annotations"
+        ## NOTE(review): with DEBUG on, this indexing raises IndexError when RecentAnnotations is empty
+        if DEBUG: print "bestAnnotation: " + str(self.RecentAnnotations[bestAnnotation])
+
+        self.geneIDattr = bestAttr
+        try:
+            self.BgeneIDattrIndx = self.candidateGeneIDsFromSignal.index(self.geneIDattr)
+        except:
+            ## reached when bestAttr is not among the candidates (e.g. still ''); fall back to the first entry
+            self.BgeneIDattrIndx = 0
+
+        ## load annotation if a better one found
+        ## bestAnnotation == 0 doubles as "keep current", so index 0 is only loaded when nothing is loaded yet
+        if bestAnnotation <> 0 or self.annotation == None:
+            self.BAnnotationIndx = bestAnnotation
+            self.annotation, self.BAnnotationIndx = self.loadRemember(self.RecentAnnotations, self.annotationCombo, self.BAnnotationIndx)
+            fn = self.RecentAnnotations[0] ## the loaded one is (becomes) always moved to 0 position
+            ## NOTE(review): assumes loadRemember returned a dict with a 'gene2GOID' key and that
+            ## RecentAnnotations is not empty - loadRemember is not fully visible here, confirm
+            self.genesInAnnotationFile[fn] = self.annotation['gene2GOID'].keys() ## update, in case the file content changed
+##            self.annotationCombo.setCurrentItem(self.BAnnotationIndx)
+
+        ## select the geneID, and rerun the GO term finding
+
+        self.geneIDchanged()
 
     def setFilelist(self, filecombo, fileList):
         filecombo.clear()
     ##########################################################################
     # handling of input/output signals
     def clusterDataset(self, data, id):
+        ## Input signal handler: stores the new cluster data, selects the gene ID
+        ## attribute and annotation for it, refreshes self.clusterGenes and makes
+        ## sure a GO aspect is loaded. The `id` argument is not used here.
+        self.clusterData = data
+        self.findMostAppropriateGeneIDandAnnotation()
+        self.clusterDatasetChanged()
+        self.loadGOaspect(forced=0) ## usually only at the first run, the GO aspect is not loaded
+##        self.findTermsBuildDAG()
+
+    def clusterDatasetChanged(self):
+        ## Rebuilds self.clusterGenes: the distinct str() values of self.geneIDattr
+        ## over self.clusterData, in order of first appearance. The list stays empty
+        ## when there is no cluster data or geneIDattr is not a candidate attribute.
         self.clusterGenes = []
-        self.clusterData = data
-        if data:
-            dattrs = [str(a.name) for a in data.domain.attributes + data.domain.getmetas().values()]
-##            print self.geneNameAttr in dattrs, dattrs
-            if self.geneNameAttr in dattrs:
-                for e in data:
-                    g = str(e[self.geneNameAttr])
+        if self.clusterData <> None:
+            if DEBUG: print "clusterDatasetChanged, self.geneIDattr: " + str(self.geneIDattr)
+            if self.geneIDattr in self.candidateGeneIDsFromSignal:
+                for e in self.clusterData:
+                    g = str(e[self.geneIDattr])
                     if g not in self.clusterGenes:
                         self.clusterGenes.append( g)
-##            print len(self.clusterGenes), self.clusterGenes[:10]
+        if DEBUG: print "input cluster genes: " + str(len(self.clusterGenes))
+        ## self.findTermsBuildDAG() need to call it, if you call clusterDatasetChanged directly
+
+    def referenceDataset(self, data, id):
+        ## Input signal handler: stores the new reference data, rebuilds
+        ## self.referenceGenes from it and recomputes the GO terms.
+        ## The `id` argument is not used here.
+        self.referenceGenes = None
+        self.referenceData = data
+        self.referenceDatasetChanged()
         self.findTermsBuildDAG()
 
-    def referenceDataset(self, data, id):
-        self.referenceGenes = []
-        if data:
-            dattrs = [str(a.name) for a in data.domain.attributes + data.domain.getmetas().values()]
-            if self.geneNameAttr in dattrs:
+    def referenceDatasetChanged(self):
+        """Rebuild self.referenceGenes from self.referenceData.
+
+        With reference data present, self.referenceGenes becomes the list of
+        distinct str() values of self.geneIDattr (empty when that attribute is
+        not in the reference domain); without reference data it is None.
+        Callers must invoke findTermsBuildDAG() themselves afterwards.
+        """
+        if DEBUG: print "reference: " + str(self.referenceData)
+        if self.referenceData <> None:
+            self.referenceGenes = []
+            dattrs = [a for a in self.referenceData.domain.attributes + self.referenceData.domain.getmetas().values()]
+            if self.geneIDattr in dattrs:
-                for e in data:
-                    g = str(e[self.geneNameAttr])
+                ## `data` is no longer a parameter of this method; iterate the stored reference table
+                for e in self.referenceData:
+                    g = str(e[self.geneIDattr])
                     if g not in self.referenceGenes:
                         self.referenceGenes.append( g)
-        self.findTermsBuildDAG()
+        else:
+            self.referenceGenes = None
+        ## self.findTermsBuildDAG() need to call it, if you call referenceDatasetChanged directly
 
     def tableSelectionChanged(self):
         for i in range(len(self.significantGOIDs)):
             # new exampletable into where to put the filtered examples
             newdata = orange.ExampleTable(newdomain)
             for e in self.clusterData:
-                g = str(e[self.geneNameAttr])
+                g = str(e[self.geneIDattr])
                 geneTermList = geneToGOterm.get(g, [])
                 if self.SelectDisjoint and len(geneTermList) > 1: ## this gene should be omitted, because belongs to many GOterms
                     continue
                 lst.insert(0, fn) # add to beginning of list
                 self.setFilelist(filecombo, lst) # update combo
                 loadedData = cPickle.load(open(fn, 'r'))
-        return loadedData
+                indx = 0
+        return loadedData, indx
 
     def browseAnnotation(self):
+        ## Lets the user pick an annotation pickle file through browseRemember, which is
+        ## handed loadAnnotation as the loader. NOTE(review): the index is reset to 0
+        ## presumably because the chosen file ends up first in RecentAnnotations - confirm
         self.browseRemember(self.RecentAnnotations, self.BAnnotationIndx, self.loadAnnotation, 'Annotation files (*.annotation)\nAll files(*.*)', 'Annotation Pickle File')
+        self.BAnnotationIndx = 0
 
     def loadAnnotation(self):
+        ## Loads the annotation selected by self.BAnnotationIndx via loadRemember, refreshes
+        ## the cached gene IDs of that file, then updates the evidences and recomputes the GO terms.
+        ## NOTE(review): assumes loadRemember returns a dict with a 'gene2GOID' key and that
+        ## RecentAnnotations is not empty - loadRemember is not fully visible here, confirm
-        self.annotation = self.loadRemember(self.RecentAnnotations, self.annotationCombo, self.BAnnotationIndx)
+        if DEBUG: print "loadAnnotation"
+        self.annotation, self.BAnnotationIndx = self.loadRemember(self.RecentAnnotations, self.annotationCombo, self.BAnnotationIndx)
+        fn = self.RecentAnnotations[0] ## the loaded one is (becomes) always moved to 0 position
+        self.genesInAnnotationFile[fn] = self.annotation['gene2GOID'].keys() ## update, in case the file content changed
         self.updateEvidences()
         self.findTermsBuildDAG()
 
     def browseGOaspect(self):
+        ## Lets the user pick a Gene Ontology pickle file through browseRemember, which is
+        ## handed loadGOaspect as the loader. NOTE(review): the index is reset to 0
+        ## presumably because the chosen file ends up first in RecentGOaspects - confirm
         self.browseRemember(self.RecentGOaspects, self.BGOaspectIndx, self.loadGOaspect, 'GO files (*.go)\nAll files(*.*)', 'Gene Ontology Pickle File')
+        self.BGOaspectIndx = 0
 
-    def loadGOaspect(self):
-        self.GO = self.loadRemember(self.RecentGOaspects, self.GOaspectCombo, self.BGOaspectIndx)
-        self.updateEvidences()
-        self.findTermsBuildDAG()
+    def loadGOaspect(self, forced=1):
+        """Load the GO aspect selected by self.BGOaspectIndx and recompute.
+
+        forced -- 1 (default) always reloads; 0 reloads only when the selected
+                  index is not 0 or no GO aspect has been loaded yet.
+        After loading, the evidences are updated and findTermsBuildDAG() is called.
+        """
+        if DEBUG: print "loadGOaspect"
+        ## load if forced, or if index has changed
+        ## if forced = 0 and index has not changed (still is 0) then don't reload the GO data
+        if forced == 1 or self.BGOaspectIndx <> 0 or self.GO == None:
+            ## tracing output only - must not reach stdout in normal operation
+            if DEBUG: print "1:", str(self.RecentGOaspects) + "," + str(self.BGOaspectIndx)
+            self.GO, self.BGOaspectIndx = self.loadRemember(self.RecentGOaspects, self.GOaspectCombo, self.BGOaspectIndx)
+            if DEBUG: print "2:", str(self.RecentGOaspects) + "," + str(self.BGOaspectIndx)
+            self.updateEvidences()
+            self.findTermsBuildDAG()
 
     def updateEvidences(self):
         if not(self.annotation) or not(self.GO): ## if data missing, just disable everything
         self.updateDAG()
 
     def findTermsBuildDAG(self):
-        if self.annotation and self.GO:
+        self.dag = {}
+        if DEBUG: print "findTermsBuildDAG, self.annotation: " + str(self.annotation <> None)
+        if DEBUG: print "findTermsBuildDAG, self.GO: " + str(self.GO <> None)
+        if self.annotation <> None and self.GO <> None:
             self.progressBarInit()
             evidences = [etype for (etype, tmpCB) in self.evidenceCheckBoxes.items() if tmpCB.isChecked()]
             if self.ReferenceType == 0: # from annotation
     ow = OWGOTermFinder()
     a.setMainWidget(ow)
 
-    d = orange.ExampleTable('testClusterSet.tab', dontCheckStored=1)
-    d = orange.ExampleTable('hjSmall.tab', dontCheckStored=1)
-##    d = orange.ExampleTable('hj.tab', dontCheckStored=1)
+##    d = orange.ExampleTable('testClusterSet.tab', dontCheckStored=1)
+##    d = orange.ExampleTable('hjSmall.tab', dontCheckStored=1)
+    d = orange.ExampleTable('hj.tab', dontCheckStored=1)
     ow.clusterDataset(d, 0)
     ow.show()
     a.exec_loop()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.