Commits

Aleš Erjavec  committed 50499d1

Changed the Organism.gene_aliases method.

  • Participants
  • Parent commits 91d14dd

Comments (0)

Files changed (3)

File _bioinformatics/obiGene.py

         self.ignore_case = ignore_case
         self.filename() # test if valid filename can be built
 
-from Orange.utils import ConsoleProgressBar
 
 class MatcherAliasesKEGG(MatcherAliasesPickled):
 
         return obiKEGG.organism_name_search(organism)
 
     def create_aliases(self):
-        organism = self._organism_name(self.organism)
         org = obiKEGG.KEGGOrganism(self.organism, genematcher=GMDirect())
-        osets = org._gm_gene_aliases()
+        osets = org.gene_aliases()
         return osets
 
     def create_aliases_version(self):
-        return obiKEGG.KEGGOrganism.organism_version(self.organism) + ".1"
+        # use KEGG short release string (e.g. '66.0+')
+        release = obiKEGG.KEGGOrganism.organism_version(self.organism) + ".2"
+        release, _ = release.split("/")
+        return release
 
     def filename(self):
-        return "kegg_2_" + self._organism_name(self.organism) 
+        return "kegg_2_" + self._organism_name(self.organism)
 
     def __init__(self, organism, ignore_case=True):
         self.organism = organism
         MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
 
+
 class MatcherAliasesFile(MatcherAliasesPickled):
 
     def create_aliases(self):

File _bioinformatics/obiKEGG/__init__.py

 import urllib2
 
 from collections import defaultdict
-
+from itertools import chain
 from datetime import datetime
 
 from Orange.utils import lru_cache
 
     def gene_aliases(self):
         """
-        Return known gene aliases (synonyms in other databases).
+        Return a list of sets of equal genes (synonyms) in KEGG for
+        this organism.
+
+        .. note::
+
+            This only includes 'ncbi-geneid' and 'ncbi-gi' records
+            from the KEGG Genes DBLINKS entries.
+
         """
-        return self.genes.gene_aliases()
+        definitions = self.api.list(self.org_code)
+        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
+        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
+
+        aliases = defaultdict(set)
+
+        for entry_id, definition in definitions:
+            # genes entry id without the organism code
+            aliases[entry_id].add(entry_id.split(":", 1)[1])
+            # all names in the NAME field (for genes, the KEGG API list
+            # operation returns 'NAME; DEFINITION' fields)
+            names = definition.split(";")[0].split(",")
+            aliases[entry_id].update([name.strip() for name in names])
+
+        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
+            aliases[target_id].add(source_id.split(":", 1)[1])
+
+        return [set([entry_id]).union(names)
+                for entry_id, names in aliases.iteritems()]
 
     def pathways(self, with_ids=None):
         """
     def enzymes(self, genes=None):
         raise NotImplementedError()
 
-    def _gm_gene_aliases(self):
-        """
-        Return a list of sets of equal genes. This is a hack for
-        gene matchers to work faster until the whole implementations
-        transitions to REST. Does not include links to DBs.
-        """
-        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
-        out = []
-        for l in s1.split('\n'):
-            if l:
-                tabs = l.split("\t")
-                cset = set([tabs[0]])
-
-                if ":" in tabs[0]:
-                    # also add 'identifier' from 'org_code:identifier'
-                    cset.add(tabs[0].split(":", 1)[-1])
-
-                try:
-                    rest = tabs[1].split(";")[0]
-                    cset |= set(rest.split(", "))
-                except:
-                    pass  # do not crash if a line does not conform
-                out.append(cset)
-        return out
-
     def get_enriched_pathways(self, genes, reference=None,
                               prob=obiProb.Binomial(), callback=None):
         """

File _bioinformatics/obiKEGG/api.py

 
         return self.service.get(ids).get()
 
-    def conv(self, ids):
-        raise NotImplementedError()
+    def conv(self, target_db, source):
+        """
+        Return a mapping from source to target_db ids as a list of
+        2-tuples [(source_id, target_id), ...].
+
+        """
+        if not isinstance(source, basestring):
+            source = "+".join(source)
+
+        res = self.service.conv(target_db)(source).get()
+        return [tuple(line.split("\t")) for line in res.splitlines()]
 
     def link(self, target_db, source_db=None, ids=None):
         if not (source_db or ids):
         return rval
 
     @cached_method
-    def conv(self, ids):
-        return KeggApi.conv(self, ids)
+    def conv(self, target_db, source):
+        return KeggApi.conv(self, target_db, source)
 
     ########
     # LinkDB