Commits

tomazc  committed 8c98194

automatic detection of appropriate genome map

  • Participants
  • Parent commits 549b9ae

Comments (0)

Files changed (1)

File OWGenomeMap.py

 <priority>100</priority>
 """
 
-import orange, OWGUI, math
+import orange, OWGUI, math, glob
 import os.path # to find out where the local files are
 from OWWidget import *
 from qtcanvas import *
 
 localdir = os.path.dirname(__file__) or "."
 
+DEBUG = 0
+
 ##############################################################################
 # main class
 
 # z coordinates for graphical objects
 zchrom = 100; zchrombb=110; zgenes = 50; zticks = 40; zsel=10
 
+## chrom definition .tab file must have the following attributes (columns):
+#   geneID
+#   chromosome
+#   start
+#   stop
+#
+# chromosome definitions should be in the first lines
+# chromosome definitions are those entries with column start == 0
+
 class OWGenomeMap(OWWidget):
-    settingsList = ["MinGeneWidth", "ShowTicks", "ColorByClass"]
+    settingsList = ["MinGeneWidth", "ShowTicks", "ColorByClass", "RecentGenomeMaps"]
 
     def __init__(self, parent=None, name='GenomeMap'):
         OWWidget.__init__(self, parent, name, "Shows the locations of genes on the chromosomes.")
         self.MinGeneWidth = 5
         self.ColorByClass = 1
         self.ShowTicks = 1
+        self.RecentGenomeMaps = []
+        self.GenomeMapIndx = 0
+        self.genesInGenomeMapFile = {}
+        self.GenomeMapLoaded = 0
         self.loadSettings()
 
+        # received and decided based on input signal
+        self.candidateGeneIDsFromSignal = [] ## list of discrete attributes present in clusterSet data signal
+        self.geneIDattrIndx = 0 ## index of attribute in candidateGeneIDsFromSignal that was selected to represent the gene IDs
+        self.geneIDattr = None ## self.geneIDattr is set accordingly
+
+        # check if genome maps exist and remove those that don't
+        # check that all files in directories "Genome Map" are included in the list
+        self.RecentGenomeMaps = filter(os.path.exists, self.RecentGenomeMaps)
+        widgetDir = os.path.dirname(os.path.abspath(__file__)) + "/"
+        gmapList = glob.glob(widgetDir + 'Genome Map/*.tab')
+        for f in gmapList:
+            f = os.path.abspath(f)
+            if f not in self.RecentGenomeMaps:
+                self.RecentGenomeMaps.append( f)
+        genesInRecentGenomeMapFile = {}
+        #
+
         self.classColors = []
-        self.loadChromosomeDefinitions()
+        self.geneCoordinates = {}
         self.data = None
         self.mid = orange.newmetaid() # meta id for a marker if a gene has a known position
-        self.graph = ChromosomeGraph(self, self.chrom)
-        
+        self.graph = ChromosomeGraph(self)
+
         # inputs and outputs
         self.inputs=[("Examples", ExampleTable, self.dataset, 1)]
         self.outputs = [("Examples", ExampleTable), ("Classified Examples", ExampleTableWithClass)]
 
-
         # GUI definition
         box = QVButtonGroup("Graph Options", self.controlArea)
         OWGUI.qwtHSlider(box, self, "MinGeneWidth", label='Min. mark width: ', labelWidth=80, minValue=1, maxValue=10, step=1, callback=self.graph.repaintGenes)
         box=QBoxLayout(self.mainArea, QVBoxLayout.TopToBottom, 0)
         self.view = ChromosomeGraphView(self.graph, self.mainArea)
         self.view.setMinimumWidth(500)
-        self.view.setMargins(0, max([x[1] for x in self.chrom]))
         box.addWidget(self.view)
 
-    def loadChromosomeDefinitions(self):
-##        import os
-##        print 'PATH', os.getcwd()
-##        self.chromData = orange.ExampleTable(localdir+'/test-genome.tab')
-        self.chromData = orange.ExampleTable(localdir+'/Genome Maps/gene loci.tab')
-        self.chrom = []
+        box = QHButtonGroup("Genome Map", self.controlArea)
+        box.setMaximumSize(250, 50)
+        self.genomeMapCombo = OWGUI.comboBox(box, self, 'GenomeMapIndx', items=[], callback=self.loadGenomeMap)
+        self.genomeMapCombo.setMaximumSize(160, 20)
+        self.setFilelist(self.genomeMapCombo, self.RecentGenomeMaps)
+        self.genomeMapBrowse = OWGUI.button(box, self, 'Browse', callback=self.browseGenomeMap)
+        self.genomeMapBrowse.setMaximumSize(50, 30)
+
+        box = QHButtonGroup("Gene ID attribute", self.controlArea)
+        box.setMaximumSize(250, 50)
+        self.geneIDAttrCombo = OWGUI.comboBox(box, self, 'geneIDattrIndx', items=[], callback=self.geneIDchanged)
+        self.geneIDAttrCombo.setMaximumSize(160, 20)
+        self.setGeneIDAttributeList()
+
+    def geneIDchanged(self):
+        if len(self.candidateGeneIDsFromSignal) > self.geneIDattrIndx:
+            self.geneIDAttrCombo.setCurrentItem(self.geneIDattrIndx)
+            self.geneIDattr = self.candidateGeneIDsFromSignal[self.geneIDattrIndx]
+        else:
+            self.geneIDattr = None
+        if DEBUG: print "changing geneID attribute to: " + str(self.geneIDattr)
+        self.datasetChanged() ## recalculate the selected genes
+        self.geneIDAttrCombo.setDisabled(len(self.candidateGeneIDsFromSignal) == 0)
+
+    def setGeneIDAttributeList(self):
+        ## refresh the list
+        self.geneIDAttrCombo.clear()
+        for f in self.candidateGeneIDsFromSignal:
+            self.geneIDAttrCombo.insertItem(str(f.name))
+        self.geneIDAttrCombo.setDisabled(len(self.candidateGeneIDsFromSignal) == 0)
+
+    def setFilelist(self, filecombo, fileList):
+        filecombo.clear()
+        if fileList != []:
+            for file in fileList:
+                (dir, filename) = os.path.split(file)
+                #leave out the path
+                fnToDisp = filename
+                filecombo.insertItem(fnToDisp)
+            filecombo.setDisabled(False)
+        else:
+            filecombo.insertItem("(none)")
+            filecombo.setDisabled(True)
+
+    def loadChromosomeDefinitions(self, filename):
         self.geneCoordinates = {}
+        chrom = []
+        try:
+            chromData = orange.ExampleTable(filename, dontCheckStored=1)
+        except:
+            self.graph.chrom = []
+            self.GenomeMapLoaded = 0
+            return
         id2desc = {}
-        for d in self.chromData:
-            if int(d['start']) == 0:
-                self.chrom.append((int(d['start']), int(d['stop']), int(d['chromosome']), str(d['DDB'])))
-                id2desc[int(d['chromosome'])] = len(self.chrom)-1
-            else:
-                self.geneCoordinates[str(d['DDB'])] = (int(d['start']), int(d['stop']), id2desc[int(d['chromosome'])] )
+        geneIDs = [] # all geneIDs in this file
+        for d in chromData:
+            geneID = str(d['geneID'])
+            if int(d['start']) == 0: ## loading chromosomes definitions
+                chrom.append((int(d['start']), int(d['stop']), int(d['chromosome']), geneID))
+                id2desc[int(d['chromosome'])] = len(chrom)-1
+            else: ## loading genes positions
+                self.geneCoordinates[geneID] = (int(d['start']), int(d['stop']), id2desc[int(d['chromosome'])] )
+            geneIDs.append( geneID)
+        self.genesInGenomeMapFile[filename] = geneIDs ## update with new data (in case file has changed)
+        self.GenomeMapLoaded = 1
+        self.graph.chrom = chrom
+
+    def repaintChromeGraph(self):
+        self.view.resetMargins(0, max([10]+ [x[1] for x in self.graph.chrom]))
+        self.graph.setMargins()
+        self.graph.paint()
+
+    def loadGenomeMap(self, change=1):
+        if self.GenomeMapIndx < len(self.RecentGenomeMaps):
+            fn = self.RecentGenomeMaps[self.GenomeMapIndx]
+            if fn != "(none)":
+                # remember the recent file list
+                if fn in self.RecentGenomeMaps: # if already in list, remove it
+                    self.RecentGenomeMaps.remove(fn)
+                self.RecentGenomeMaps.insert(0, fn) # add to beginning of list
+                self.setFilelist(self.genomeMapCombo, self.RecentGenomeMaps) # update combo
+                self.loadChromosomeDefinitions(fn)
+                if change: self.datasetChanged() ## repaint
+
+    def browseGenomeMap(self):
+        if self.RecentGenomeMaps == []:
+            startfile = "."
+        else:
+            startfile = self.RecentGenomeMaps[0]
+        filename = QFileDialog.getOpenFileName(startfile, 'Genome Map files (*.tab)\nAll files(*.*)', None, 'Genome Map File')
+        fn = str(filename)
+        fn = os.path.abspath(fn)
+        if fn in self.RecentGenomeMaps: # if already in list, remove it
+            self.RecentGenomeMaps.remove(fn)
+        self.RecentGenomeMaps.insert(0, fn)
+        self.GenomeMapIndx = 0
+        self.loadGenomeMap()
+
+    def findMostAppropriateGeneIDandGenomeMap(self):
+        if self.data == None:
+            self.candidateGeneIDsFromSignal = []
+            self.geneIDattrIndx = -1
+            self.geneIDattr = None
+            self.setGeneIDAttributeList()
+            return
+
+        ## all discrete and string type attributes are good candidates
+        self.candidateGeneIDsFromSignal = [a for a in self.data.domain.attributes + self.data.domain.getmetas().values() if a.varType == orange.VarTypes.Discrete or a.varType == orange.VarTypes.Other]
+        self.setGeneIDAttributeList()
+        self.geneIDAttrCombo.setDisabled(1)
+
+        ## check if there are new genome map files present
+        ## remove from geneID2genomeMapfile those not present in the RecentGenomeMaps list
+        ## geneID is key, item is list of indexes in self.RecentGenomeMaps that have that geneID
+        geneID2genomeMapfile = {}
+        cn = 0
+        for f in self.RecentGenomeMaps:
+            if f not in self.genesInGenomeMapFile.keys():
+                if DEBUG: print "loading", f
+                chromData = orange.ExampleTable(f, dontCheckStored=1)
+                geneIDs = [str(d['geneID']) for d in chromData] # all geneIDs in this file
+                self.genesInGenomeMapFile[f] = geneIDs # update with new data (in case file has changed)
+            for geneID in self.genesInGenomeMapFile[f]:
+                tmpl = geneID2genomeMapfile.get(geneID, [])
+                if cn not in tmpl:
+                    tmpl.append(cn)
+                    geneID2genomeMapfile[geneID] = tmpl
+            cn += 1
+
+        ## for each attribute look how many genesID are there, that are also present in geneID2genomeMapfile
+        ## if current self.geneIDattr has count 0
+        ## then select attribute with highest count
+        ## else keep self.geneIDattr
+
+        ## when best attribute selected, check if the loaded genome map is ok
+        ## otherwise suggest the most appropriate genome map
+        bestAttr = '' ## key is attribute, item is number of recognized geneIDs
+        bestCn = 0
+        bestGenomeMap = 0
+        lst = self.candidateGeneIDsFromSignal
+        if self.geneIDattr <> None and self.geneIDattr in self.candidateGeneIDsFromSignal: lst = [self.geneIDattr] + lst
+
+        for attr in lst:
+            vals = [ex[attr] for ex in self.data]
+
+            ## calculate the frequency of each annotation file to which this geneID belongs to
+            genomeMapFrequency = {}
+            cn = 0
+            for v in vals:
+                v = str(v)
+                i = geneID2genomeMapfile.get(v, -1) ## -1, not present
+                if i <> -1:
+                    for ai in i:
+                        af = genomeMapFrequency.get(ai, 0)
+                        genomeMapFrequency[ai] = af + 1
+                    cn += 1
+            if cn > bestCn or (cn > 0 and attr == self.geneIDattr):
+                bestAttr = attr
+                bestCn = cn
+                gmfs = [(f, gmindex) for (gmindex, f) in genomeMapFrequency.items()]
+                if len(gmfs) > 0:
+                    gmfs.sort()
+                    gmfs.reverse() ## most frequent first
+                    bestGenomeMap = gmfs[0][1]
+                else:
+                    bestGenomeMap = 0 ## keep current
+        if DEBUG: print "best attribute: " + str(bestAttr) + " with " + str(bestCn) + " gene IDs from genome map"
+        if DEBUG: print "bestGenomeMap: " + str(self.RecentGenomeMaps[bestGenomeMap])
+
+        self.geneIDattr = bestAttr
+        try:
+            self.geneIDattrIndx = self.candidateGeneIDsFromSignal.index(self.geneIDattr)
+        except:
+            self.geneIDattrIndx = 0
+
+        ## load annotation if a better one found
+        if bestGenomeMap <> 0 or not(self.GenomeMapLoaded):
+            self.GenomeMapIndx = bestGenomeMap
+            self.loadGenomeMap(0)
+##            self.loadChromosomeDefinitions(self.RecentGenomeMaps[self.GenomeMapIndx])
+
+        ## select the geneID, and rerun the GO term finding
+        if DEBUG: print "geneID changed"
+        self.geneIDchanged()
 
     def dataset(self, data):
         self.data = data
+        self.findMostAppropriateGeneIDandGenomeMap() ## select most appropriate attribute only when first receiving the signal
+        self.graph.selection = []
+        self.datasetChanged()
 
+    def datasetChanged(self):
+        if self.geneIDattr == None:
+            self.coord = []
+            if self.data:
+                for (i,d) in enumerate(self.data):
+                    d[self.mid] = 0
+            self.repaintChromeGraph() ## paint empty graph
+            return
+
+        ## in chrome data mark those records
+        ## where the geneID matches that from geneCoordinates
         if self.data:
-            mid = self.mid
-            ### XXX issue a warning if not found
-            found = 0
-            metas = self.data.domain.getmetas()
-            for rmiIndx in metas.keys():
-                if metas[rmiIndx].name == 'DDB':
-                    found = 1
-                    break
-            if not found:
-                print 'XXX warning: DDB not found in data set'
-                return
+            ## make a self.coord the same size as input signal data
+            self.coord = [None] * len(self.data)
+            for (i,d) in enumerate(self.data):
+                geneID = str(d[str(self.geneIDattr.name)])
+                if self.geneCoordinates.has_key(geneID):
+                    self.coord[i] = self.geneCoordinates[geneID]
+                    d[self.mid] = 1
+                else:
+                    ### XXX issue a warning
+                    d[self.mid] = 0
+##                    print 'no key for', geneID
 
-            coord = self.geneCoordinates
-            self.coord = [None] * len(data)
-            if self.data:
-                for (i,d) in enumerate(data):
-    ##                rmi = str(d.getmeta(rmiIndx))    # XXX change to this once bug is removed
-                    rmi = str(d['DDB'])
-                    if coord.has_key(rmi):
-                        self.coord[i] = coord[rmi]
-                        d[mid] = 1
-                    else:
-                        ### XXX issue a warning
-                        d[mid] = 0
-                        print 'no key for', rmi
-            else:
-                pass
+            ## create color map
+            self.colorByClassCB.setDisabled(self.data.domain.classVar == None)
+            if self.data.domain.classVar:
+                self.classColors = []
+                for i in range(len(self.data.domain.classVar.values)):
+                    newColor = QColor()
+                    newColor.setHsv(i*360/len(self.data.domain.classVar.values), 255, 255)
+                    self.classColors.append(newColor)
+            self.repaintChromeGraph()
 
-            self.colorByClassCB.setDisabled(data.domain.classVar == None)
-            if data.domain.classVar:
-                self.classColors = []
-                for i in range(len(data.domain.classVar.values)):
-                    newColor = QColor()
-                    newColor.setHsv(i*360/len(data.domain.classVar.values), 255, 255)
-                    self.classColors.append(newColor)
-
-        self.graph.paint()
-        
 ##############################################################################
 # graph with chromosomes and genes
 
 ##gSelectionColor = Qt.yellow
 
 class ChromosomeGraph(QCanvas):
-    def __init__(self, parent, chrom):
+    def __init__(self, parent, chrom = []):
         apply(QCanvas.__init__,(self, parent, ""))
         self.parent = parent
         self.chrom = chrom
         self.bpL, self.bpR = view.margins[-1]
         self.bpW = float(self.bpR - self.bpL)
         self.ticks = []
+        ## find longest chrome
         for c in self.chrom:
-            self.ticks.append(self.getTicks(max(self.bpL, c[0]), min(self.bpR, c[1])))
+            self.ticks.append(self.getTicks(max(self.bpL, c[0]), min(self.bpR, c[1]), self.bpL, self.bpR))
 
     # converts bp index to canvas position
     def bp2x(self, bp):
 
     def paint(self):
         view = self.parent.view
-        self.resize(view.width()-20, max(view.height()-5, yoffset+(len(self.chrom)+1)*yspace+yoffset))
+        self.resize(view.width()-20, max(view.height()-5, yoffset+(len(self.chrom)+1)*(ychrom+yspace) - yspace))
         self.gwidth = self.width() - 2*xoffset
 
         # remove everything on present canvas
                     diff = int((self.parent.MinGeneWidth - (rp-lp)) / 2)
                     lp, rp = max(lp-diff, lborder), min(rp+diff, rborder)
                 y = yoffset + coord[2]*(ychrom+yspace)
-                r = QCanvasRectangle(lp, y, rp-lp, ychrom+1, self)
+                r = QCanvasRectangle(lp, y, max(self.parent.MinGeneWidth, rp-lp), ychrom+1, self)
                 if colorclass:
                     color = colors[int(d.getclass())]
                 else:
                 r.setBrush(QBrush(gSelectionColor)); r.setPen(QPen(gSelectionColor))
                 r.show()
         
-    def getTicks(self, lower, upper):
+    def getTicks(self, lower, upper, abslower, absupper):
         ideal = (upper-lower)/2.
+        absideal = (absupper - abslower)/2
+        ideal = max(absideal /4.0, ideal) ## don't display a too fine scale if the rest is quite big
         if ideal<=0:
             return []
 ##        print 'iii %d (%d-%d)' % (ideal, upper, lower)
         if self.canvas():
             self.canvas().paint()
 
+    def resetMargins(self, left, right):
+        self.margins = []
+        self.setMargins(left, right)
+
     def setMargins(self, left, right):
-        self.margins.append((left, right))
+        self.margins.append((max(0, left), right))
         self.canvas().setMargins()
 
     def retractMargins(self):
 
     def contentsMouseMoveEvent(self, event):
         x = event.pos().x()
-        if self.zoomStart:
+        if self.zoomStart <> None:
             self.drawZoom(self.zoomStart, x)
-        elif self.selStart:
+        if self.selStart <> None:
             self.drawSel(self.selStart, x)
 
     def contentsMouseReleaseEvent(self, event):
         x = event.pos().x()
-        if self.zoomStart:
-            left = self.canvas().x2bp(min(self.zoomStart, x))
-            right = self.canvas().x2bp(max(self.zoomStart, x))
+        if self.zoomStart <> None:
+            zoomStart = self.zoomStart ## remember is locally, and set it to None right away
+            self.zoomStart = None
+            left = self.canvas().x2bp(min(zoomStart, x))
+            right = self.canvas().x2bp(max(zoomStart, x))
             if abs(right-left) < 50:
                 self.zoomRect.setCanvas(None)
                 self.canvas().update()
             else:
                 self.setMargins(left, right)
                 self.canvas().paint()
-            self.zoomStart = None
-        elif self.selStart:
-            self.canvas().addSelection(self.selStart, x, self.chrom.id, self.selRect, replace=not self.addSelection)
-            self.selStart = None
+        if self.selStart <> None:
+            selStart = self.selStart ## remember is locally, and set it to None right away
+            self.selStart = None     ## otherwise the selection continues when other widgets process
+            self.canvas().addSelection(selStart, x, self.chrom.id, self.selRect, replace=not self.addSelection)
 
     def keyPressEvent(self, e):
         self.shiftPressed = e.key() == 4128