Marko Toplak committed 4c4266f

Removed the old version of obiKEGG. Renamed obiKEGG2 -> obiKEGG.


Files changed (32)

_bioinformatics/obiGO.py

             sf = orngServerFiles.ServerFiles()
             available = sf.listfiles("GO")
             if file not in available:
-                from . import obiKEGG2
-                raise obiKEGG2.OrganismNotFoundError(org + str(code))
+                from . import obiKEGG
+                raise obiKEGG.OrganismNotFoundError(org + str(code))
             orngServerFiles.download("GO", file)
 
         return cls(path, ontology=ontology, genematcher=genematcher, progressCallback=progressCallback)

_bioinformatics/obiGene.py

 from Orange.orng import orngServerFiles
 
 from . import obiTaxonomy
+from . import obiKEGG
 
 default_database_path = orngServerFiles.localpath("NCBI_geneinfo")
 
 class MatcherAliasesKEGG(MatcherAliasesPickled):
 
     def _organism_name(self, organism):
-        from . import obiKEGG2
-        return obiKEGG2.organism_name_search(organism)
+        return obiKEGG.organism_name_search(organism)
 
     def create_aliases(self):
         organism = self._organism_name(self.organism)
-        from . import obiKEGG2
-        org = obiKEGG2.KEGGOrganism(self.organism, genematcher=GMDirect())
+        org = obiKEGG.KEGGOrganism(self.organism, genematcher=GMDirect())
         osets = org._gm_gene_aliases()
         return osets
 
     def create_aliases_version(self):
-        from . import obiKEGG2
-        return obiKEGG2.KEGGOrganism.organism_version(self.organism) + ".1"
+        return obiKEGG.KEGGOrganism.organism_version(self.organism) + ".1"
 
     def filename(self):
         return "kegg_2_" + self._organism_name(self.organism) 

_bioinformatics/obiGeneSets.py

     """
     Returns gene sets from KEGG pathways.
     """
-    from . import obiKEGG2 as obiKEGG
     
     kegg = obiKEGG.KEGGOrganism(org)
 
     """
     Return gene sets from miRNA targets
     """
-    from . import obimiRNA, obiKEGG2
-    org_code = obiKEGG2.from_taxid(org)
+    from . import obimiRNA
+    org_code = obiKEGG.from_taxid(org)
     link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
     mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
     genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
     """
     orngServerFiles.update_local_files()
 
-    from . import obiKEGG2 as obiKEGG
-
     genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
     organisms = obiTaxonomy.common_taxids()
     for fn in genesetsfn:

_bioinformatics/obiGenomicsUpdate.py

         print "Updating GO anotations for", name
         pkgUpdate.UpdateAnnotation(org)
 
-    pkgUpdate = obiGenomicsUpdate.PKGUpdate("kegg", obiKEGG.Update())
-
-    print "Updating KEGG taxonomy"
-    pkgUpdate.UpdateTaxonomy()
-
-    print "Updating KEGG orthology"
-    pkgUpdate.UpdateOrthology()
-
-    print "Updating KEGG reference pathways"
-    pkgUpdate.UpdateReference()
-
-    for org, name in [("hsa", "Homo sapiens"), ("sce", "Yeast")]:
-        print "Updating KEGG pathways for", name
-        pkgUpdate.UpdateOrganism(org)
-    
-
 def firstUpdateQt():
     pass
     

_bioinformatics/obiKEGG/__init__.py

+"""\
+==============================================
+KEGG - Kyoto Encyclopedia of Genes and Genomes
+==============================================
+
+This is a Python module for accessing `KEGG`_ through its web services. To use this
+module you need to have the `SUDS`_ Python library installed (other backends are planned).
+
+.. _`KEGG`: http://www.genome.jp/kegg/
+
+.. _`SUDS`: http://pypi.python.org/pypi/suds/
+
+"""
+from __future__ import absolute_import
+
+import urllib2
+import os, sys
+from collections import defaultdict
+
+from datetime import datetime
+
+from Orange.utils import lru_cache
+
+from . import databases
+from . import entry
+
+from .brite import BriteEntry, Brite
+
+from . import api
+from . import conf
+from . import pathway
+
+KEGGGenome = databases.Genome
+KEGGGenes = databases.Genes
+KEGGEnzymes = databases.Enzymes
+KEGGReaction = databases.Reactions
+KEGGCompounds = databases.Compounds
+KEGGPathways = databases.Pathways
+
+KEGGBrite = Brite
+KEGGBriteEntry = BriteEntry
+
+KEGGPathway = pathway.Pathway
+
+DEFAULT_CACHE_DIR = conf.params["cache.path"]
+
+
+from .. import obiProb
+from Orange.utils import deprecated_keywords, deprecated_attribute
+
+class OrganismNotFoundError(Exception): pass
+
+class Organism(object):
+    def __init__(self, org, genematcher=None):
+        self.org_code = self.organism_name_search(org)
+        self.genematcher = genematcher
+        self.api = api.CachedKeggApi()
+        
+    @property
+    def org(self):
+        return self.org_code
+    
+    @property
+    def genes(self):
+        if not hasattr(self, "_genes"):
+            genes = KEGGGenes(self.org_code)
+            self._genes = genes
+        return self._genes
+    
+    def gene_aliases(self):
+        return self.genes.gene_aliases()
+    
+    def pathways(self, with_ids=None):
+        if with_ids is not None:
+            return self.api.get_pathways_by_genes(with_ids)
+        else:
+            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
+    
+    def list_pathways(self):
+        return self.pathways()
+    
+    def get_linked_pathways(self, pathway_id):
+        return self.api.get_linked_pathways(pathway_id)
+        
+    def enzymes(self, genes=None):
+        raise NotImplementedError()
+    
+    def _gm_gene_aliases(self):
+        """
+        Return a list of sets of equal genes. This is a hack for
+        gene matchers to work faster until the whole implementations
+        transitions to REST. Does not include links to DBs.
+        """
+        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
+        out = []
+        for l in s1.split('\n'):
+            if l:
+                tabs = l.split("\t")
+                cset = set([tabs[0]])
+                try:
+                    rest = tabs[1].split(";")[0]
+                    cset |= set(rest.split(", "))
+                except:
+                    pass #do not crash if a line does not conform
+                out.append(cset)
+        return out
+
+    def get_enriched_pathways(self, genes, reference=None, prob=obiProb.Binomial(), callback=None):
+        """ Return a dictionary with enriched pathways ids as keys
+        and (list_of_genes, p_value, num_of_reference_genes) tuples 
+        as items.
+        
+        """
+        allPathways = defaultdict(lambda :[[], 1.0, []])
+        from Orange.orng import orngMisc
+        milestones = orngMisc.progressBarMilestones(len(genes), 100)
+        pathways_db = KEGGPathways()
+        
+        pathways_for_gene = []
+        for i, gene in enumerate(genes):
+            pathways_for_gene.append(self.pathways([gene]))
+            if callback and i in milestones:
+                callback(i*50.0/len(genes))
+                
+        # precache for speed 
+        pathways_db.pre_cache([pid for pfg in pathways_for_gene for pid in pfg]) 
+        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
+            for pathway in pathways:
+                if pathways_db.get_entry(pathway).gene: 
+                    allPathways[pathway][0].append(gene)
+            if callback and i in milestones:
+                callback(50.0 + i*50.0/len(genes))
+        reference = set(reference if reference is not None else self.genes.keys())
+        
+        pItems = allPathways.items()
+        
+        for i, (p_id, entry) in enumerate(pItems):
+            pathway = pathways_db.get_entry(p_id)
+            entry[2].extend(reference.intersection(pathway.gene or []))
+            entry[1] = prob.p_value(len(entry[0]), len(reference), len(entry[2]), len(genes))
+        return dict([(pid, (genes, p, len(ref))) for pid, (genes, p, ref) in allPathways.items()])
+        
+    def get_genes_by_enzyme(self, enzyme):
+        enzyme = KEGGEnzymes().get_entry(enzyme)
+        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
+    
+    def get_genes_by_pathway(self, pathway_id):
+        return KEGGPathway(pathway_id).genes()
+    
+    def get_enzymes_by_pathway(self, pathway_id):
+        return KEGGPathway(pathway_id).enzymes()
+    
+    def get_compounds_by_pathway(self, pathway_id):
+        return KEGGPathway(pathway_id).compounds()
+    
+    def get_pathways_by_genes(self, gene_ids):
+        return self.api.get_pathways_by_genes(gene_ids)
+        gene_ids = set(gene_ids)
+        pathways = [self.genes[id].pathway for id in gene_ids if self.genes[id].pathway]
+        pathways = reduce(set.union, pathways, set())
+        return [id for id in pathways if gene_ids.issubset(KEGGPathway(id).genes())] 
+    
+    def get_pathways_by_enzymes(self, enzyme_ids):
+        enzyme_ids = set(enzyme_ids)
+        pathways = [KEGGEnzymes()[id].pathway for id in enzyme_ids]
+        pathways = reduce(set.union, pathways, set())
+        return [id for id in pathways if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
+    
+    def get_pathways_by_compounds(self, compound_ids):
+        compound_ids = set(compound_ids)
+        pathways = [KEGGCompounds()[id].pathway for id in compound_ids]
+        pathways = reduce(set.union, pathways, set())
+        return [id for id in pathways if compound_ids.issubset(KEGGPathway(id).compounds())]
+    
+    def get_enzymes_by_compound(self, compound_id):
+        return KEGGCompounds()[compound_id].enzyme
+    
+    def get_enzymes_by_gene(self, gene_id):
+        return self.genes[gene_id].enzymes
+    
+    def get_compounds_by_enzyme(self, enzyme_id):
+        return self._enzymes_to_compounds.get(enzyme_id)
+    
+    @deprecated_keywords({"caseSensitive": "case_sensitive"})
+    def get_unique_gene_ids(self, genes, case_sensitive=True):
+        """Return a tuple with three elements. The first is a dictionary mapping from unique gene
+        ids to gene names in genes, the second is a list of conflicting gene names and the third is a list
+        of unknown genes.
+        """
+        unique, conflicting, unknown = {}, [], []
+        for gene in genes:
+            names = self.genematcher.match(gene)
+            if len(names) == 1:
+                unique[names[0]] = gene
+            elif len(names) == 0:
+                unknown.append(gene)
+            else:
+                conflicting.append(gene)
+        return unique, conflicting, unknown
+    
+    def get_genes(self):
+        return self.genes
+    
+    @classmethod
+    def organism_name_search(cls, name):
+        genome = KEGGGenome()
+        if name not in genome:
+            ids = genome.search(name)
+            if not ids:
+                from .. import obiTaxonomy
+                ids = obiTaxonomy.search(name)
+                ids = [id for id in ids if genome.search(id)]
+            name = ids.pop(0) if ids else name
+            
+        try:
+            return genome[name].entry_key
+        except KeyError:
+            raise OrganismNotFoundError(name)
+        
+    @classmethod
+    def organism_version(cls, name):
+        name = cls.organism_name_search(name)
+        genome = KEGGGenome()
+        info = genome.api.binfo(name)
+        return info.release
+    
+    def _set_genematcher(self, genematcher):
+        setattr(self, "_genematcher", genematcher)
+        
+    def _get_genematcher(self):
+        if getattr(self, "_genematcher", None) == None:
+            from .. import obiGene
+            if self.org_code == "ddi":
+                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
+                                                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]])
+            else:
+                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code)])
+            self._genematcher.set_targets(self.genes.keys())
+        return self._genematcher
+    
+    genematcher = property(_get_genematcher, _set_genematcher)
+    
+KEGGOrganism = Organism
+    
+def organism_name_search(name):
+    return KEGGOrganism.organism_name_search(name)
+
+def pathways(org):
+    return KEGGPathway.list(org)
+
+def organisms():
+    return KEGGOrganism.organisms()
+
+def from_taxid(taxid):
+    genome = KEGGGenome()
+    res = genome.search(taxid)
+    for r in res:
+        e = genome[r]
+        
+        if e.taxid in [taxid,  genome.TAXID_MAP.get(taxid, taxid)]:
+            return e.org_code()
+        
+    return None
+
+def to_taxid(name):
+    genome = KEGGGenome()
+    if name in genome:
+        return genome[name].taxid
+    
+    keys = genome.search(name)
+    if keys:
+        return genome[keys[0]].taxid
+    else:
+        return None
+
+def create_gene_sets():
+    pass
+
+def main():
+    KEGGGenome()
+    import doctest
+    extraglobs = {"api": KeggApi()}
+    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
+
+if __name__ == "__main__":
+    sys.exit(main())
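
A short usage sketch for the public classes defined above (illustrative, not part of this commit; it assumes the KEGG web service is reachable and the gene ids are placeholders):

    from Orange.bio import obiKEGG   # import path is an assumption

    org = obiKEGG.KEGGOrganism("Homo sapiens")           # resolved via organism_name_search
    all_pathways = org.pathways()                        # pathway ids for the organism
    enriched = org.get_enriched_pathways(["hsa:672", "hsa:675"])   # placeholder gene ids
    for pathway_id, (genes, p_value, ref_size) in enriched.items():
        print pathway_id, p_value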

_bioinformatics/obiKEGG/api.py

+"""
+KEGG api interface.
+
+"""
+from __future__ import absolute_import
+
+from contextlib import closing
+
+from .service import web_service
+from .types import *
+
+class KeggApi(object):
+    """ KEGG API """
+    
+    def __init__(self):
+        self.service = web_service()
+        
+    ##################
+    # Meta information
+    ##################
+    
+    def list_databases(self):
+        """ Returns a list of available databases.
+        
+        >>> api.list_databases()
+        [Definition(entry_id='nt',...
+         
+        """
+        return map(Definition.from_items, self.service.list_databases())
+    
+    def list_organisms(self):
+        """ Return a list of all available organisms
+        
+        >>> api.list_organisms()
+        [Definition(entry_id='hsa',...
+        
+        """
+        return map(Definition.from_items, self.service.list_organisms())
+    
+    def list_pathways(self, organism):
+        """ Return a list of all available pathways for `organism`
+        
+        >>> api.list_pathways("hsa")
+        [Definition(entry_id=',...
+        
+        """
+        return map(Definition.from_items, self.service.list_pathways(organism))
+        
+    #######
+    # DBGET
+    #######
+     
+    def binfo(self, db):
+        """ Return info for database `db`
+        
+        >>> print api.binfo("gb")
+        genbank          GenBank nucleic acid sequence database
+        gb               Release 186.0, Oct 11
+                         National Center for Biotechnology Information
+                         144,458,648 entries, 132,067,413,372 bases
+                         Last update: 11/10/24
+                         <dbget> <fasta> <blast>
+                         
+        """
+        result = self.service.binfo(db)
+        if result is not None:
+            return BInfo.from_text(str(result))
+        else:
+            return result
+    
+    def bfind(self, db, keywords):
+        """ Search database 'db' for keywords
+        """
+        result = self.service.bfind(" ".join([db, keywords]))
+        if result is not None:
+            return str(result)
+        else:
+            return result
+    
+    def bget(self, ids):
+        """
+        """
+        if not isinstance(ids, basestring):
+            # Sequence of ids
+            ids = " ".join(ids)
+        result = self.service.bget(ids)
+        if result is not None:
+            return str(result)
+        else:
+            return result
+    
+    def btit(self, ids):
+        """
+        """
+        if not isinstance(ids, basestring):
+            ids = " ".join(ids)
+            
+        result = self.service.btit(ids)
+        if result is not None:
+            return str(result)
+        else:
+            return result
+    
+    def bconv(self, ids):
+        if not isinstance(ids, basestring):
+            ids = " ".join(ids)
+            
+        result = self.service.bconv(ids)
+        if result is not None:
+            return str(result)
+        else:
+            return result
+    
+    ########
+    # LinkDB
+    ########
+    
+    def get_linkdb_by_entry(self, entry_id, db, offset, limit):
+        links = self.service.get_linkdb_by_entry(entry_id, db, offset, limit)
+        return [LinkDBRelation(**d) for d in \
+                map(dict, links)]
+        
+    def get_linkdb_between_databases(self, from_db, to_db, offset, limit):
+        links = self.service.get_linkdb_between_databases(from_db, to_db, offset, limit)
+        return [LinkDBRelation(**d) for d in \
+                map(dict, links)]
+        
+    def get_genes_by_enzyme(self, enzyme_id, org):
+        return self.service.get_genes_by_enzyme(enzyme_id, org)
+    
+    def get_enzymes_by_gene(self, genes_id):
+        return self.service.get_enzymes_by_gene(genes_id)
+    
+    def get_enzymes_by_compound(self, compound_id):
+        return self.service.get_enzymes_by_compound(compound_id)
+    
+    def get_enzymes_by_glycan(self, glycan_id):
+        return self.service.get_enzymes_by_glycan(glycan_id)
+    
+    def get_enzymes_by_reaction(self, reaction_id):
+        return self.service.get_enzymes_by_reaction(reaction_id)
+    
+    def get_compounds_by_enzyme(self, enzyme_id):
+        return self.service.get_compounds_by_enzyme(enzyme_id)
+    
+    def get_compounds_by_reaction(self, reaction_id):
+        return self.service.get_compounds_by_reaction(reaction_id)
+    
+    def get_glycans_by_enzyme(self, enzyme_id):
+        return self.service.get_glycans_by_enzyme(enzyme_id)
+    
+    def get_glycans_by_reaction(self, reaction_id):
+        return self.service.get_glycans_by_reaction(reaction_id)
+    
+    def get_reactions_by_enzyme(self, enzyme_id):
+        return self.service.get_reactions_by_enzyme(enzyme_id)
+    
+    def get_reactions_by_compound(self, compound_id):
+        return self.service.get_reactions_by_compound(compound_id)
+    
+    def get_reactions_by_glycan(self, glycan_id):
+        return self.service.get_reactions_by_glycan(glycan_id)
+    
+    ######
+    # SSDB
+    ######
+    
+    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
+        ssr = self.service.get_best_best_neighbors_by_gene(genes_id, offset, limit)
+        return [SSDBRelation(**d) for d in \
+                map(dict, ssr)]
+    
+    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
+        ssr = self.service.get_best_neighbors_by_gene(genes_id, offset, limit)
+        return [SSDBRelation(**d) for d in \
+                map(dict, ssr)]
+    
+    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
+        ssr = self.service.get_reverse_best_neighbors_by_gene(genes_id, offset, limit)
+        return [SSDBRelation(**d) for d in \
+                map(dict, ssr)]
+    
+    def get_paralogs_by_gene(self, genes_id, offset, limit):
+        ssr =  self.service.get_paralogs_by_gene(genes_id, offset, limit)
+        return [SSDBRelation(**d) for d in \
+                map(dict, ssr)]
+    
+    #######
+    # Motif
+    #######
+    
+    def get_motifs_by_gene(self, genes_id, db):
+        motif = self.service.get_motifs_by_gene(genes_id, db)
+        return [MotifResult(**d) for d in \
+                map(dict, motif)]
+    
+    def get_genes_by_motifs(self, motif_id_list, offset, limit):
+        genes = self.service.get_genes_by_motifs(motif_id_list, offset, limit)
+        return [Definition(**d) for d in \
+                map(dict, genes)]
+    
+    ####
+    # KO
+    ####
+    
+    def get_ko_by_gene(self, genes_id):
+        return self.service.get_ko_by_gene(genes_id)
+    
+    def get_ko_by_ko_class(self, ko_class_id):
+        return self.service.get_ko_by_ko_class(ko_class_id)
+    
+    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
+        return self.service.get_genes_by_ko_class(ko_class_id, org, offset, limit)
+    
+    def get_genes_by_ko(self, ko_id, org):
+        return self.service.get_genes_by_ko(ko_id, org)
+    
+    #########
+    # Pathway
+    #########
+    
+    def mark_pathway_by_objects(self, pathway_id, object_id_list):
+        return self.service.mark_pathway_by_objects(pathway_id, object_id_list)
+    
+    def color_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list):
+        return self.service.color_pathway_by_objects(pathway_id, object_id_list, fg_color_list, bg_color_list)
+    
+    def color_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list):
+        return self.service.color_pathway_by_elements(pathway_id, element_id_list, fg_color_list, bg_color_list)
+    
+    def get_html_of_marked_pathway_by_objects(self, pathway_id, object_id_list):
+        return self.service.get_html_of_marked_pathway_by_objects(pathway_id, object_id_list)
+    
+    def get_html_of_colored_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list):
+        return self.service.get_html_of_colored_pathway_by_objects(pathway_id, object_id_list, fg_color_list, bg_color_list)
+    
+    def get_html_of_colored_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list):
+        return self.service.get_html_of_colored_pathway_by_elements(pathway_id, element_id_list, fg_color_list, bg_color_list)
+    
+    def get_references_by_pathway(self, pathway_id):
+        return self.service.get_references_by_pathway(pathway_id)
+    
+    def get_element_relations_by_pathway(self, pathway_id):
+        return self.service.get_element_relations_by_pathway(pathway_id)
+    
+    
+    
+    def get_genes_by_organism(self, organism, offset=None, limit=None):
+        if offset is None and limit is None:
+            offset = 0
+            limit = self.get_number_of_genes_by_organism(organism)
+            
+        return self.service.get_genes_by_organism(organism, offset, limit)
+    
+    def get_number_of_genes_by_organism(self, organism):
+        return self.service.get_number_of_genes_by_organism(organism)
+    
+    ####################
+    # Objects by pathway
+    ####################
+    
+    def get_elements_by_pathway(self, pathway_id):
+        return self.service.get_elements_by_pathway(pathway_id)
+    
+    def get_genes_by_pathway(self, pathway_id):
+        return self.service.get_genes_by_pathway(pathway_id)
+    
+    def get_enzymes_by_pathway(self, pathway_id):
+        return self.service.get_enzymes_by_pathway(pathway_id)
+    
+    def get_compounds_by_pathway(self, pathway_id):
+        return self.service.get_compounds_by_pathway(pathway_id)
+    
+    def get_drugs_by_pathway(self, pathway_id):
+        return self.service.get_drugs_by_pathway(pathway_id)
+    
+    def get_glycans_by_pathway(self, pathway_id):
+        return self.service.get_glycans_by_pathway(pathway_id)
+    
+    def get_reactions_by_pathway(self, pathway_id):
+        return self.service.get_reactions_by_pathway(pathway_id)
+    
+    def get_kos_by_pathway(self, pathway_id):
+        return self.service.get_kos_by_pathway(pathway_id)
+    
+    #####################
+    # Pathways by objects
+    #####################
+    
+    def get_pathways_by_genes(self, gene_list):
+        return map(str, self.service.get_pathways_by_genes(gene_list))
+    
+    def get_pathways_by_enzymes(self, enzyme_list):
+        return map(str, self.service.get_pathways_by_enzymes(enzyme_list))
+    
+    def get_pathways_by_compounds(self, compound_list):
+        return map(str, self.service.get_pathways_by_compounds(compound_list))
+    
+    def get_pathways_by_drugs(self, drug_list):
+        return map(str, self.service.get_pathways_by_drugs(drug_list))
+    
+    def get_pathways_by_glycans(self, glycan_list):
+        return map(str, self.service.get_pathways_by_glycans(glycan_list))
+    
+    def get_pathways_by_reactions(self, reaction_list):
+        return map(str, self.service.get_pathways_by_reactions(reaction_list))
+    
+    def get_pathways_by_kos(self, ko_list):
+        return map(str, self.service.get_pathways_by_kos(ko_list))
+    
+    ##########################
+    # Relations among pathways
+    ##########################
+    
+    def get_linked_pathways(self, pathway_id):
+        if not pathway_id.startswith("path:"):
+            pathway_id = "path:" + pathway_id
+        return map(str, self.service.get_linked_pathways(pathway_id))
+    
+    
+"""
+KEGG api with caching
+"""
+
+import os
+from datetime import datetime
+
+from . import caching
+from .caching import cached_method, cache_entry, touch_dir
+
+try:
+    from functools import lru_cache
+except ImportError:
+    # TODO: move a copy of lru_cache in .caching if distributing this as a
+    # standalone package
+    from Orange.utils import lru_cache
+
+    
+class CachedKeggApi(KeggApi):
+    def __init__(self, store=None):
+        KeggApi.__init__(self)
+        self.store = store if store is not None else {}
+    
+    # Needed API for cached decorator.
+    def cache_store(self):
+        from . import conf
+        path = conf.params["cache.path"]
+        touch_dir(path)
+        return caching.Sqlite3Store(os.path.join(path,
+                                                 "kegg_api_cache_1.sqlite3"))
+    
+    def last_modified(self, args, kwargs=None):
+        return getattr(self, "default_release", "")
+    
+    def set_default_release(self, release):
+        self.default_release = release
+        
+    
+    ##################
+    # Meta information
+    ##################
+    
+    @lru_cache() # not persistently cached
+    def list_databases(self):
+        return KeggApi.list_databases(self)
+    
+    @cached_method
+    def list_organisms(self):
+        return KeggApi.list_organisms(self)
+    
+    @cached_method
+    def list_pathways(self, organism):
+        return KeggApi.list_pathways(self, organism)
+    
+    #######
+    # DBGET
+    #######
+    
+    @lru_cache() # not persistently cached
+    def binfo(self, db):
+        return KeggApi.binfo(self, db)
+    
+    @cached_method
+    def bfind(self, db, keywords):
+        return KeggApi.bfind(self, db, keywords)
+    
+    @cached_method
+    def bget(self, ids):
+        if not isinstance(ids, basestring):
+            return self._batch_bget(ids)
+        else:
+            return KeggApi.bget(self, ids)
+        
+    def _batch_bget(self, ids):
+        if len(ids) > 100:
+            raise ValueError("Can batch at most 100 ids at a time.")
+        
+        bget = self.bget
+        uncached = []
+        with closing(bget.cache_store()) as store:
+            # Which ids are already cached
+            # TODO: Invalidate entries by release string.
+            for id in ids:
+                key = bget.key_from_args((id,))
+                if key not in store:
+                    uncached.append(id)
+                
+        if uncached:
+            # in case there are duplicate ids
+            uncached = sorted(set(uncached))
+            rval = KeggApi.bget(self, uncached)
+            if rval is not None:
+                entrys = rval.split("///\n")
+            else:
+                entrys = []
+                
+            if entrys and not entrys[-1].strip():
+                # Delete the last newline if present
+                del entrys[-1]
+            
+            if len(entrys) == len(uncached):
+                with closing(bget.cache_store()) as store:
+                    for id, entry in zip(uncached, entrys):
+                        key = bget.key_from_args((id,))
+                        if entry is not None:
+                            entry = entry + "///\n"
+                        store[key] = cache_entry(entry, mtime=datetime.now())
+                        
+            else:
+                # Try to bisect the uncached list
+                if len(uncached) > 1 and len(uncached) - len(entrys) < 4:
+                    split = len(uncached) / 2
+                    self._batch_bget(uncached[:split])
+                    self._batch_bget(uncached[split:])
+                else:
+                    import warnings
+                    warnings.warn("Batch contains invalid ids", UserWarning)
+        
+        # Finally join all the results, but drop all None objects
+        entries = filter(lambda e: e is not None, map(bget, ids))
+        
+        rval = "".join(entries)
+        return rval
+    
+    @cached_method
+    def btit(self, ids):
+        return KeggApi.btit(self, ids)
+    
+    @cached_method
+    def bconv(self, ids):
+        return KeggApi.bconv(self, ids)
+    
+    ########
+    # LinkDB
+    ########
+    
+    @cached_method
+    def get_linkdb_by_entry(self, entry_id, db, offset, limit):
+        return KeggApi.get_linkdb_by_entry(self, entry_id, db, offset, limit)
+        
+    @cached_method
+    def get_linkdb_between_databases(self, from_db, to_db, offset, limit):
+        return KeggApi.get_linkdb_between_databases(self, from_db, to_db, offset, limit)
+            
+    @cached_method
+    def get_genes_by_enzyme(self, enzyme_id, org):
+        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)
+    
+    @cached_method
+    def get_enzymes_by_gene(self, genes_id):
+        return KeggApi.get_enzymes_by_gene(self, genes_id)
+    
+    @cached_method
+    def get_enzymes_by_compound(self, compound_id):
+        return KeggApi.get_enzymes_by_compound(self, compound_id)
+    
+    @cached_method
+    def get_enzymes_by_glycan(self, glycan_id):
+        return KeggApi.get_enzymes_by_glycan(self, glycan_id)
+    
+    @cached_method
+    def get_enzymes_by_reaction(self, reaction_id):
+        return KeggApi.get_enzymes_by_reaction(self, reaction_id)
+    
+    @cached_method
+    def get_compounds_by_enzyme(self, enzyme_id):
+        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)
+    
+    @cached_method
+    def get_compounds_by_reaction(self, reaction_id):
+        return KeggApi.get_compounds_by_reaction(self, reaction_id)
+    
+    @cached_method
+    def get_glycans_by_enzyme(self, enzyme_id):
+        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)
+    
+    @cached_method
+    def get_glycans_by_reaction(self, reaction_id):
+        return KeggApi.get_glycans_by_reaction(self, reaction_id)
+    
+    @cached_method
+    def get_reactions_by_enzyme(self, enzyme_id):
+        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)
+    
+    @cached_method
+    def get_reactions_by_compound(self, compound_id):
+        return KeggApi.get_reactions_by_compound(self, compound_id)
+    
+    @cached_method
+    def get_reactions_by_glycan(self, glycan_id):
+        return KeggApi.get_reactions_by_glycan(self, glycan_id)
+    
+    ######
+    # SSDB
+    ######
+    
+    @cached_method
+    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
+        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset, limit)
+    
+    @cached_method
+    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
+        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset, limit)
+    
+    @cached_method
+    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
+        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit)
+    
+    @cached_method
+    def get_paralogs_by_gene(self, genes_id, offset, limit):
+        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)
+    
+    #######
+    # Motif
+    #######
+    
+    @cached_method
+    def get_motifs_by_gene(self, genes_id, db):
+        return KeggApi.get_motifs_by_gene(self, genes_id, db)
+    
+    @cached_method
+    def get_genes_by_motifs(self, motif_id_list, offset, limit):
+        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)
+
+    ####
+    # KO
+    ####
+    
+    @cached_method
+    def get_ko_by_gene(self, genes_id):
+        return KeggApi.get_ko_by_gene(self, genes_id)
+    
+    @cached_method
+    def get_ko_by_ko_class(self, ko_class_id):
+        return KeggApi.get_ko_by_ko_class(self, ko_class_id)
+    
+    @cached_method
+    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
+        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset, limit)
+    
+    @cached_method
+    def get_genes_by_ko(self, ko_id, org):
+        return KeggApi.get_genes_by_ko(self, ko_id, org)
+    
+    #########
+    # Pathway
+    #########
+    
+    # TODO
+    
+    
+    
+    @cached_method
+    def get_genes_by_organism(self, organism, offset=None, limit=None):
+        return KeggApi.get_genes_by_organism(self, organism, offset=offset, limit=limit)
+    
+    @cached_method
+    def get_number_of_genes_by_organism(self, organism):
+        return KeggApi.get_number_of_genes_by_organism(self, organism)
+     
+    @cached_method
+    def get_pathways_by_genes(self, gene_list):
+        return KeggApi.get_pathways_by_genes(self, gene_list)
+    
+    @cached_method
+    def get_pathways_by_enzymes(self, enzyme_list):
+        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)
+    
+    @cached_method
+    def get_pathways_by_compounds(self, compound_list):
+        return KeggApi.get_pathways_by_compounds(self, compound_list)
+    
+    @cached_method
+    def get_pathways_by_drugs(self, drug_list):
+        return KeggApi.get_pathways_by_drugs(self, drug_list)
+    
+    @cached_method
+    def get_pathways_by_glycans(self, glycan_list):
+        return KeggApi.get_pathways_by_glycans(self, glycan_list)
+    
+    @cached_method
+    def get_pathways_by_reactions(self, reaction_list):
+        return KeggApi.get_pathways_by_reactions(self, reaction_list)
+    
+    @cached_method
+    def get_pathways_by_kos(self, ko_list):
+        return KeggApi.get_pathways_by_kos(self, ko_list)
+    
+    @cached_method
+    def get_elements_by_pathway(self, pathway_id):
+        return KeggApi.get_elements_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_genes_by_pathway(self, pathway_id):
+        return KeggApi.get_genes_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_enzymes_by_pathway(self, pathway_id):
+        return KeggApi.get_enzymes_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_compounds_by_pathway(self, pathway_id):
+        return KeggApi.get_compounds_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_drugs_by_pathway(self, pathway_id):
+        return KeggApi.get_drugs_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_glycans_by_pathway(self, pathway_id):
+        return KeggApi.get_glycans_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_reactions_by_pathway(self, pathway_id):
+        return KeggApi.get_reactions_by_pathway(self, pathway_id)
+    
+    @cached_method
+    def get_kos_by_pathway(self, pathway_id):
+        return KeggApi.get_kos_by_pathway(self, pathway_id)
+    
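
A sketch of how the cached client is used (illustrative, not part of this commit; results are persisted in the sqlite3 store configured in obiKEGG.conf):

    from Orange.bio.obiKEGG import api   # import path is an assumption

    client = api.CachedKeggApi()
    organisms = client.list_organisms()       # cached via @cached_method
    pathways = client.list_pathways("hsa")    # a repeated call is served from the cache
    entry_text = client.bget(["hsa:672"])     # placeholder gene id; fetched in batches, cached per id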

_bioinformatics/obiKEGG/brite.py

+"""
+KEGG brite
+
+"""
+from __future__ import absolute_import
+
+import os
+import re
+import urllib2
+
+from Orange.utils import deprecated_attribute
+
+from . import conf
+
+class BriteEntry(object):
+    _search_re = {"ids": re.compile('(?P<ids>\[.*:.*\])'),
+                  "title": re.compile(r'(<[Bb]>)?(?P<title>\b[a-zA-Z0-9_/\s,;:.+=\-\[\]{}\(\)]+?)(?(1)</[Bb]>)$'),
+                  "links": re.compile('(?P<links><a href=".+?">.*?</a>)')}
+    def __init__(self, line):
+        self.entries = []
+        self.line = line[1:].strip()
+        for name, regex in self._search_re.items():
+            search = regex.search(self.line)
+            setattr(self, name, search.group(name) if search else None)
+
+    def __iter__(self):
+        return iter(self.entries)
+
+    entrys = deprecated_attribute("entrys", "entries")
+
+class Brite(BriteEntry):
+    VERSION = "v1.0"
+    BRITE_URL_FORMAT = "http://www.genome.jp/kegg-bin/download_htext?htext={brite_id}.keg&format=htext&filedir="
+    
+    def __init__(self, brite_id, local_cache=None):
+        super(Brite, self).__init__("")
+        self.brite_id = brite_id
+        if local_cache is None:
+            local_cache = conf.params["cache.path"]
+        self.local_cache = local_cache
+        
+        self.load(brite_id)
+    
+    def _get_brite(self, brite_id):
+        url = self.BRITE_URL_FORMAT.format(brite_id=brite_id)
+        local_filename = os.path.join(self.local_cache, brite_id + ".keg")
+        if not os.path.exists(local_filename):
+            brite = urllib2.urlopen(url).read()
+            with open(local_filename, "wb") as f:
+                f.write(brite)
+                
+        return open(local_filename, "rb")
+        
+    def load(self, brite_id):
+        lines = self._get_brite(brite_id).read().split("\n!\n")[1].splitlines()
+        
+        # TODO: Implement a proper parser
+        
+        def collect(lines, depth, collection):
+            while lines:
+                line = lines[0]
+                if line.startswith("#"):
+                    lines.pop(0)
+                elif line.startswith(depth) and len(line.strip()) > 1:
+                    collection.append(BriteEntry(lines.pop(0))) 
+                elif line[0] > depth:
+                    collect(lines, line[0], collection[-1].entries)
+                elif line[0] < depth:
+                    return
+                else:
+                    lines.pop(0)
+                        
+        collect([line for line in lines if not line.startswith("#") and len(line) > 1], "A", self.entries)
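
A sketch of loading a BRITE hierarchy with the class above (illustrative, not part of this commit; the hierarchy id is only an example of a *.keg htext id, and the cache directory is assumed to exist):

    from Orange.bio.obiKEGG.brite import Brite   # import path is an assumption

    hierarchy = Brite("br08303")     # example id; downloaded once, then read from the local cache
    for top in hierarchy:
        print top.title
        for child in top:
            print "   ", child.title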

_bioinformatics/obiKEGG/caching.py

+"""
+Caching framework for cached kegg api calls.
+ 
+"""
+import os
+import UserDict
+import sqlite3
+import cPickle as pickle
+
+from datetime import datetime, date, timedelta
+from . import conf
+
+class Store(object):
+    def __init__(self):
+        self.timestamp = 0
+        
+    def open(self):
+        raise NotImplementedError
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, *args):
+        pass
+
+class Sqlite3Store(Store, UserDict.DictMixin):
+    def __init__(self, filename):
+        self.filename = filename
+        self.con = sqlite3.connect(filename)
+        #self.con = sqlite3.connect(":memory:")
+        self.con.execute("""
+        CREATE TABLE IF NOT EXISTS cache 
+            (key TEXT UNIQUE,
+             value TEXT
+            )
+        """)
+        self.con.execute("""
+        CREATE INDEX IF NOT EXISTS cache_index
+        ON cache (key)
+        """)
+        self.con.commit()
+        
+    def __getitem__(self, key):
+        cur = self.con.execute("""
+            SELECT value
+            FROM cache
+            WHERE key=?
+        """, (key,))
+        r = cur.fetchall()
+        
+        if not r:
+            raise KeyError(key)
+        else:
+            return pickle.loads(str(r[0][0]))
+    
+    def __setitem__(self, key, value):
+        value = pickle.dumps(value)
+        self.con.execute("""
+            INSERT OR REPLACE INTO cache
+            VALUES (?, ?)
+        """, (key, value))
+        self.con.commit()
+        
+    def __delitem__(self, key):
+        self.con.execute("""
+            DELETE FROM cache
+            WHERE key=?
+        """, (key,))
+        self.con.commit()
+        
+    def keys(self):
+        cur = self.con.execute("""
+            SELECT key
+            FROM cache
+        """)
+        return [str(r[0]) for r in cur.fetchall()]
+        
+    def close(self):
+        pass
+    
+    
+class DictStore(Store, UserDict.DictMixin):
+    def __init__(self):
+        Store.__init__(self)
+        
+    def close(self):
+        pass
+    
+    
+from functools import wraps
+from contextlib import closing
+
+
+class cache_entry(object):
+    def __init__(self, value, mtime=None, expires=None):
+        self.value = value
+        self.mtime = mtime
+        self.expires = expires
+        
+_SESSION_START = datetime.now()
+
+class cached_wrapper(object):
+    """ TODO: needs documentation
+    """
+    def __init__(self, function, instance, class_, cache_store, last_modified=None):
+        self.function = function
+        self.instance = instance
+        self.class_ = class_
+        self.cache_store = cache_store
+        self.last_modified = last_modified
+        
+    def has_key(self, key):
+        with closing(self.cache_store()) as store:
+            return key in store
+    
+    def key_from_args(self, args, kwargs=None):
+        key = self.function.__name__ + repr(args)
+        return key
+    
+    def invalidate_key(self, key):
+        with closing(self.cache_store()) as store:
+            del store[key]
+            
+    def last_modified_from_args(self, args, kwargs=None):
+        key = self.key_from_args(args, kwargs)
+        if self.instance is not None:
+            return self.instance.last_modified(args)
+        
+    def invalidate_args(self, args):
+        return self.invalidate_key(self.key_from_args(args))
+        
+    def invalidate_all(self):
+        prefix = self.key_from_args(()).rstrip(",)")
+        with self.cache_store() as store:
+            for key in store.keys():
+                if key.startswith(prefix):
+                    del store[key]
+    
+    def memoize(self, args, kwargs, value, timestamp=None):
+        key = self.key_from_args(args, kwargs)
+        if timestamp is None:
+            timestamp = datetime.now()
+            
+        with closing(self.cache_store()) as store:
+            store[key] = cache_entry(value, mtime=timestamp)
+        
+    def __call__(self, *args):
+        key = self.key_from_args(args)
+        with closing(self.cache_store()) as store:
+            valid = True
+            if key not in store:
+                valid = False
+            else:
+                entry = store[key]
+                rval = entry.value
+                
+                if not self.is_entry_valid(entry, args):
+                    valid = False
+            if not valid:
+                rval = self.function(self.instance, *args)
+                store[key] = cache_entry(rval, datetime.now(), None)
+        
+        return rval
+        
+    def min_timestamp(self, args):
+        key = self.key_from_args(args)
+        return datetime.fromtimestamp(0)
+    
+    def is_entry_valid(self, entry, args):
+        # Need to check datetime first (it subclasses date)
+        if isinstance(entry.mtime, datetime):
+            mtime = entry.mtime
+        elif isinstance(entry.mtime, date):
+            mtime = datetime(entry.mtime.year, entry.mtime.month,
+                             entry.mtime.day, 1, 1, 1)
+        else:
+            return False
+        
+        if self.min_timestamp(args) > mtime:
+            return False
+        
+        last_modified = self.last_modified_from_args(args)
+        
+        if isinstance(last_modified, date):
+            last_modified = datetime(last_modified.year, last_modified.month,
+                                     last_modified.day, 1, 1, 1)
+        elif isinstance(last_modified, basestring):
+            # Could have different format
+            mtime = mtime.strftime("%Y %m %d %H %M %S") 
+        
+        elif last_modified is None:
+            if conf.params["cache.invalidate"] == "always":
+                return False
+            elif conf.params["cache.invalidate"] == "session":
+                last_modified = _SESSION_START
+            elif conf.params["cache.invalidate"] == "daily":
+                last_modified = datetime.now().replace(hour=0, minute=0,
+                                                       second=0, microsecond=0)
+            elif conf.params["cache.invalidate"] == "weekly":
+                last_modified = datetime.now() - timedelta(7)
+            else: # ???
+                pass
+        return last_modified <= mtime
+        
+class cached_method(object):
+    def __init__(self, function):
+        self.function = function
+        
+    def __get__(self, instance, owner):
+        if instance is not None:
+            return cached_wrapper(self.function, instance, owner,
+                                  self.get_cache_store(instance, owner))
+        return self
+    
+    def get_cache_store(self, instance, owner):
+        if hasattr(instance, "cache_store"):
+            return instance.cache_store
+        elif not hasattr(instance, "_cached_method_cache"):
+            instance._cached_method_cache = DictStore()
+        return instance._cached_method_cache
+
+
+class bget_cached_method(cached_method):
+    def __get__(self, instance, owner):
+        if instance is not None:
+            return cached_wrapper(self.function, instance, owner,
+                                  self.get_cache_store(instance, owner),
+                                  self.get_last_modified(instance, owner))
+        return self
+    
+    def get_last_modified(self, instance, owner):
+        if hasattr(instance, "last_modified"):
+            return instance.last_modified
+    
+def touch_dir(path):
+    path = os.path.expanduser(path)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    
+def clear_cache():
+    """Clear all locally cached KEGG data.
+    """
+    import glob
+    path = conf.params["cache.path"]
+    if os.path.realpath(path) != os.path.realpath(conf.kegg_dir):
+        raise Exception("Non default cache path. Please remove the contents of %r manually." % path)
+    
+    for cache_filename in glob.glob(os.path.join(path, "*.sqlite3")):
+        os.remove(cache_filename)
+        
+    for ko_filename in glob.glob(os.path.join(path, "*.keg")):
+        os.remove(ko_filename)
+        
+    for kgml_filename in glob.glob(os.path.join(path, "*.xml")):
+        os.remove(kgml_filename)
+        
+    for png_filename in glob.glob(os.path.join(path, "*.png")):
+        os.remove(png_filename)
+    
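
A sketch of how the cached_method decorator above is meant to be applied (illustrative, not part of this commit; the class only needs a cache_store factory and, optionally, a last_modified hook, as CachedKeggApi provides):

    from Orange.bio.obiKEGG.caching import cached_method, Sqlite3Store   # import path is an assumption

    class ExampleClient(object):
        def cache_store(self):
            # cached_method looks for this factory on the instance
            return Sqlite3Store("/tmp/example_cache.sqlite3")   # path is illustrative

        def last_modified(self, args, kwargs=None):
            return ""   # no release info; entries then expire per the "cache.invalidate" setting

        @cached_method
        def slow_call(self, key):
            return key.upper()   # stands in for an expensive web-service call

    client = ExampleClient()
    client.slow_call("hsa")   # computed and stored
    client.slow_call("hsa")   # served from the sqlite3 cache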

_bioinformatics/obiKEGG/conf.py

+"""
+obiKEGG configuration
+
+mostly just caching settings
+
+"""
+
+import os
+import ConfigParser
+from StringIO import StringIO
+from Orange.utils import serverfiles
+
+kegg_dir = serverfiles.localpath("KEGG2")
+
+default = """
+[cache]
+# path = %(home)s/.obiKEGG/
+path = %(kegg_dir)s/
+store = sqlite3
+invalidate = weekly
+
+[service]
+transport = urllib2
+# transport = requests
+
+"""
+
+# Orange kegg files dir
+
+env = dict(os.environ)
+env["kegg_dir"] = kegg_dir
+
+parser = ConfigParser.ConfigParser(env)
+
+
+parser.readfp(StringIO(default), "default")
+
+# TODO: global settings rc file
+parser.read([os.path.expanduser("~/.obiKEGG/rc.cfg")])
+
+params = {}
+
+_ALL_PARAMS = \
+    ["cache.path",
+     "cache.store",
+     "cache.invalidate",
+     "service.transport"
+     ]
+for p in _ALL_PARAMS:
+    section, option = p.split(".")
+    params[p] = parser.get(section, option)
+    
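
A sketch of how the effective settings are consumed (illustrative, not part of this commit; the defaults above can be overridden in ~/.obiKEGG/rc.cfg):

    from Orange.bio.obiKEGG import conf   # import path is an assumption

    print conf.params["cache.path"]         # the serverfiles "KEGG2" directory by default
    print conf.params["cache.invalidate"]   # "weekly" unless overridden in the rc file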

_bioinformatics/obiKEGG/databases.py

+"""
+DBGET database
+"""
+from __future__ import absolute_import
+
+import re
+
+from . import entry
+from .entry import fields
+from . import api
+
+def iter_take(source_iter, n):
+    source_iter = iter(source_iter)
+    return [item for _, item in zip(range(n), source_iter)]
+
+def batch_iter(source_iter, n):
+    source_iter = iter(source_iter)
+    while True:
+        batch = iter_take(source_iter, n)
+        if batch:
+            yield batch
+        else:
+            break
+        
+def chain_iter(chains_iter):
+    for it in chains_iter:
+        for element in it:
+            yield element
+
+class DBDataBase(object):
+    """ A wrapper for DBGET database.
+    """
+    # ENTRY_TYPE constructor (type)
+    ENTRY_TYPE = entry.DBEntry
+    
+    # Needs to be set in a subclass or object instance 
+    DB = None
+    
+    def __init__(self, **kwargs):
+        if not self.DB:
+            raise TypeError("Cannot make an instance of abstract base class %r." \
+                            % type(self).__name__)
+            
+        self.api = api.CachedKeggApi()
+        self.info = self.api.binfo(self.DB)
+        release = self.info.release
+        self.api.set_default_release(release)
+        self._keys = []
+        
+    def keys(self):
+        return list(self._keys)
+    
+    def iterkeys(self):
+        return iter(self._keys)
+    
+    def items(self):
+        return list(zip(self.keys(), self.batch_get(self.keys())))
+    
+    def iteritems(self):
+        batch_size = 100
+        iterkeys = self.iterkeys()
+        return chain_iter(zip(batch, self.batch_get(batch))
+                          for batch in batch_iter(iterkeys, batch_size))
+        
+#        return ((key, self.__getitem__(key)) for key in self.iterkeys())
+    
+    def values(self):
+        return self.batch_get(self.keys())
+    
+    def itervalues(self):
+        batch_size = 100
+        iterkeys = self.iterkeys()
+        return chain_iter(self.batch_get(batch)
+                          for batch in batch_iter(iterkeys, batch_size))
+        
+#        return (self.__getitem__(key) for key in self.iterkeys())
+    
+    def get(self, key, default=None):
+        try:
+            return self.__getitem__(key)
+        except KeyError:
+            return default
+        
+    def has_key(self, key):
+        return self.__contains__(key)
+    
+    def __getitem__(self, key):
+        e = self.get_entry(key)
+        if e is None:
+            raise KeyError(key)
+        else:
+            return e
+    
+    def __contains__(self, key):
+        return key in set(self.keys())
+    
+    def __len__(self):
+        return len(self.keys())
+    
+    def __iter__(self):
+        return iter(self.keys())
+    
+    def get_text(self, key):
+        key = self._add_db(key)
+        return self.api.bget([key])
+    
+    def get_entry(self, key):
+        text = self.get_text(key)
+        if not text or text == "None":
+            return None
+        else:
+            return self.ENTRY_TYPE(text)
+        
+    def find(self, name):
+        """ Find ``name`` using BFIND. 
+        """
+        res = self.api.bfind(self.DB, name).splitlines()
+        return [r.split(" ", 1)[0] for r in res]    
+        
+    def pre_cache(self, keys=None, batch_size=100, progress_callback=None):
+        """ Retrive all the entries and cache them locally.
+        """
+        # TODO do this in multiple threads
+    
+        if not isinstance(self.api, api.CachedKeggApi):
+            raise TypeError("Not an an instance of api.CachedKeggApi")
+        
+        if batch_size > 100 or batch_size < 1:
+            raise ValueError("Invalid batch_size")
+        
+        if keys is None:
+            keys = self.keys()
+            
+        keys = list(keys)
+        start = 0
+        while start < len(keys):
+            batch = keys[start: start + batch_size]
+            batch = map(self._add_db, batch)
+            
+            self.api.bget(batch)
+            
+            if progress_callback:
+                progress_callback(100.0 * start / len(keys))
+                
+            start += batch_size
+            
+    def batch_get(self, keys):
+        """ Batch retrieve all entries for keys. This can be
+        significantly faster than getting each entry separately,
+        especially if the entries are not yet cached.
+        
+        """
+        entries = []
+        batch_size = 100
+        keys = list(keys)
+        start = 0
+        while start < len(keys):
+            batch = keys[start: start + batch_size]
+            batch = map(self._add_db, batch)
+            batch_entries = self.api.bget(batch)
+            if batch_entries is not None:
+                batch_entries = batch_entries.split("///\n")
+                # Remove possible empty last line  
+                batch_entries = [e for e in batch_entries if e.strip()]
+                entries.extend(map(self.ENTRY_TYPE, batch_entries))
+            start += batch_size
+            
+        return entries
+            
+    def _add_db(self, key):
+        """ Prefix the key with '%(DB)s:' string if not already
+        prefixed. 
+        """
+        if not key.startswith(self.DB + ":"):
+            return self.DB + ":" + key
+        else:
+            return key
+        
+    @property
+    def entries(self):
+        return self.values()
+    
+@entry.entry_decorate
+class GenomeEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("DEFINITION", fields.DBDefinitionField),
+              ("ANNOTATION", fields.DBSimpleField),
+              ("TAXONOMY", fields.DBTaxonomyField),
+              ("DATA_SOURCE", fields.DBSimpleField),
+              ("ORIGINAL_DB", fields.DBSimpleField),
+              ("KEYWORDS", fields.DBSimpleField),
+              ("DISEASE", fields.DBSimpleField),
+              ("COMMENT", fields.DBSimpleField),
+              ("CHROMOSOME", fields.DBFieldWithSubsections),
+              ("STATISTICS", fields.DBSimpleField),
+              ("REFERENCE", fields.DBReference)]
+    
+    MULTIPLE_FIELDS = ["REFERENCE"]
+    
+    def __init__(self, text):
+        entry.DBEntry.__init__(self, text)
+        
+    @property
+    def entry_key(self):
+        """ Primary entry key used for querying.
+        
+        .. note:: Unlike most of the other entry types this is the
+            first listed 'NAME'.
+            
+        """
+        
+        return self.name.split(",", 1)[0]
+
+    @property
+    def taxid(self):
+        return self.TAXONOMY.taxid
+            
+    def org_code(self):
+        if self.name is not None:
+            return self.name.split(",")[0]
+        else:
+            return self.entry.split(" ")[0]
+        
+
+class Genome(DBDataBase):
+    DB = "genome"
+    ENTRY_TYPE = GenomeEntry
+    
+    # For obiTaxonomy.common_taxids mapping
+    TAXID_MAP = {"562": "511145",   # Escherichia coli K-12 MG1655
+                 "2104": "272634",  # Mycoplasma pneumoniae M129 
+                 "4530": "39947",   # Oryza sativa ssp. japonica cultivar Nipponbare (Japanese rice)
+                 "4932" : "559292", # Saccharomyces cerevisiae S288C
+                 "4896": "284812",  # Schizosaccharomyces pombe 972h-
+                 }
+    
+    def __init__(self):
+        DBDataBase.__init__(self)
+        self._keys = [org.entry_id for org in self.api.list_organisms()]
+    
+    def _key_to_gn_entry_id(self, key):
+        res = self.find(key)
+        if len(res) == 0:
+            raise KeyError("Unknown key")
+        elif len(res) > 1:
+            raise ValueError("Not a unique key")
+        else:
+            return res[0]
+    
+    @classmethod
+    def common_organisms(cls):
+        return ['ath', 'bta', 'cel', 'cre', 'dre', 'ddi',
+                'dme', 'eco', 'hsa', 'mmu', 'mpn', 'osa',
+                'pfa', 'rno', 'sce', 'spo', 'zma', 'xla']
+        
+    @classmethod
+    def essential_organisms(cls):
+        return ['ddi', 'dme', 'hsa', 'mmu', 'sce']
+    
+    def search(self, string, relevance=False):
+        """ Search the genome database for string using ``bfind``.
+        """
+        if relevance:
+            raise NotImplementedError("relevance is no longer supported")
+        if string in self.TAXID_MAP:
+            string = self.TAXID_MAP[string]
+            
+        res = self.api.bfind(self.DB, string)
+        if not res:
+            return []
+        
+        res = res.splitlines()
+        res = [r.split(",", 1)[0] for r in res]
+        res = [r.split(" ", 1)[1] for r in res]
+        return res
+    
+    
+@entry.entry_decorate
+class GeneEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("DEFINITION", fields.DBDefinitionField),
+              ("ORGANISM", fields.DBSimpleField),
+              ("ORTHOLOGY", fields.DBSimpleField),
+              ("DRUG_TARGET", fields.DBSimpleField),
+              ("PATHWAY", fields.DBPathway),
+              ("MODULE", fields.DBSimpleField),
+              ("DISEASE", fields.DBSimpleField),
+              ("CLASS", fields.DBSimpleField),
+              ("POSITION", fields.DBSimpleField),
+              ("MOTIF", fields.DBSimpleField),
+              ("DBLINKS", fields.DBDBLinks),
+              ("STRUCTURE", fields.DBSimpleField),
+              ("AASEQ", fields.DBAASeq),
+              ("NTSEQ", fields.DBNTSeq)]
+    
+    def aliases(self):
+        dblinks = self.dblinks.items() if self.dblinks else []
+        return ([self.entry_key]
+                + (self.name.split(",") if self.name else [])
+                + [link[1][0] for link in dblinks])
+
+    @property
+    def alt_names(self):
+        """ For backwards compatibility.
+        """
+        return self.aliases()
+  
+class Genes(DBDataBase):
+    DB = None  # Set to the organism code in __init__
+    ENTRY_TYPE = GeneEntry
+    
+    def __init__(self, org_code):
+        self.DB = org_code
+        self.org_code = org_code
+        DBDataBase.__init__(self)
+        self._keys = self.api.get_genes_by_organism(org_code)
+        
+    def gene_aliases(self):
+        aliases = {}
+        for gene_entry in self.itervalues():
+            aliases.update(dict.fromkeys(
+                gene_entry.aliases(),
+                self.org_code + ":" + gene_entry.entry_key))
+        return aliases
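+    # Usage sketch (illustrative; the alias and the mapped value are
+    # assumptions, not verified output):
+    #
+    #     >>> genes = Genes("hsa")
+    #     >>> aliases = genes.gene_aliases()
+    #     >>> aliases.get("INSR")   # e.g. 'hsa:3643'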
+    
+
+@entry.entry_decorate
+class CompoundEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("FORMULA", fields.DBSimpleField),
+              ("MASS", fields.DBSimpleField),
+              ("REMARK", fields.DBSimpleField),
+              ("REACTION", fields.DBSimpleField),
+              ("PATHWAY", fields.DBPathway),
+              ("ENZYME", fields.DBSimpleField),
+              ("DBLINKS", fields.DBDBLinks),
+              ("ATOM", fields.DBSimpleField),
+              ("BOND", fields.DBSimpleField)
+              ]
+    
+    
+class Compounds(DBDataBase):
+    DB = "cpd"
+    ENTRY_TYPE = CompoundEntry
+    
+    def __init__(self):
+        DBDataBase.__init__(self)
+        self._keys = []  # The full list of keys is not available
+
+
+@entry.entry_decorate    
+class ReactionEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("DEFINITION", fields.DBDefinitionField),
+              ("EQUATION", fields.DBSimpleField),
+              ("ENZYME", fields.DBSimpleField)
+              ]
+    
+class Reactions(DBDataBase):
+    DB = "rn"
+    ENTRY_TYPE = ReactionEntry
+    
+    def __init__(self):
+        DBDataBase.__init__(self)
+        self._keys = []  # The full list of keys is not available
+         
+class Brite(DBDataBase):
+    DB = "br"
+    
+class Disease(DBDataBase):
+    DB = "ds"
+        
+class Drug(DBDataBase):
+    DB = "dr"
+    
+@entry.entry_decorate
+class EnzymeEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("CLASS", fields.DBSimpleField),
+              ("SYSNAME", fields.DBSimpleField),
+              ("REACTION", fields.DBSimpleField),
+              ("ALL_REAC", fields.DBSimpleField),
+              ("SUBSTRATE", fields.DBSimpleField),
+              ("PRODUCT", fields.DBSimpleField),
+              ("COMMENT", fields.DBSimpleField),
+              ("REFERENCE", fields.DBReference),
+              ("PATHWAY", fields.DBPathway),
+              ("ORTHOLOGY", fields.DBSimpleField),
+              ("GENES", fields.DBSimpleField),
+              ("DBLINKS", fields.DBDBLinks)
+              ]
+    
+    MULTIPLE_FIELDS = ["REFERENCE"]
+    
+class Enzymes(DBDataBase):
+    DB = "ec"
+    ENTRY_TYPE = EnzymeEntry
+    
+    
+@entry.entry_decorate
+class OrthologyEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("CLASS", fields.DBSimpleField),
+              ("DBLINKS", fields.DBDBLinks),
+              ("GENES", fields.DBSimpleField),
+              ]
+    
+class Orthology(DBDataBase):
+    DB = "ko"
+    ENTRY_TYPE = OrthologyEntry
+    
+    
+@entry.entry_decorate
+class PathwayEntry(entry.DBEntry):
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ("NAME", fields.DBNameField),
+              ("DESCRIPTION", fields.DBSimpleField),
+              ("CLASS", fields.DBSimpleField),
+              ("PATHWAY_MAP", fields.DBPathwayMapField),
+              ("DISEASE", fields.DBSimpleField),
+              ("DRUG", fields.DBSimpleField),
+              ("DBLINKS", fields.DBDBLinks),
+              ("ORGANISM", fields.DBSimpleField),
+              ("GENE", fields.DBGeneField),
+              ("ENZYME", fields.DBEnzymeField),
+              ("COMPOUND", fields.DBCompoundField),
+              ("REFERENCE", fields.DBReference),
+              ("REL_PATHWAY", fields.DBSimpleField),
+              ("KO_PATHWAY", fields.DBSimpleField),
+              ]
+    
+    MULTIPLE_FIELDS = ["REFERENCE"]
+    
+    @property
+    def gene(self):
+        if hasattr(self, "GENE"):
+            genes = self.GENE._convert()
+        else:
+            return None
+        
+        org = self.organism
+        org_prefix = ""
+        if org:
+            match = re.findall(r"\[GN:([a-z]+)\]", org)
+            if match:
+                org_prefix = match[0] + ":"
+        genes = [org_prefix + g for g in genes]
+        return genes 
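+    # Illustrative example: with an ORGANISM field such as
+    # "Homo sapiens (human) [GN:hsa]", the gene identifiers from the GENE
+    # field are returned prefixed with the organism code, e.g. "hsa:10458"
+    # (the specific id here is only an example).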
+    
+class Pathways(DBDataBase):
+    DB = "path"
+    ENTRY_TYPE = PathwayEntry
+    
+    def __init__(self):
+        DBDataBase.__init__(self)
+    

_bioinformatics/obiKEGG/entry/__init__.py

+"""
+DBGET entry parsing and representation.
+"""
+from __future__ import absolute_import
+
+__all__ = ["parser", "fields"]
+
+from collections import defaultdict
+
+from . import fields
+from .parser import DBGETEntryParser
+
+def entry_decorate(cls):
+    """ Decorate the DBEntry subclass with properties for acessing
+    the fields through the 'DBField._convert' interface
+     
+    """
+    reserved_names_map = {"class": "class_", "def": "def_"}
+    def construct_one(name):
+        def get(self):
+            field = getattr(self, name, None)
+            if field is not None:
+                return field._convert()
+            else:
+                return None
+        return property(get, doc=name)
+    
+    def construct_multiple(name):
+        def get(self):
+            field = getattr(self, name, None)
+            if field is not None:
+                return [f._convert() for f in field]
+            else:
+                return None
+        return property(get, doc=name)
+    
+    for name, field in cls.FIELDS:
+        name_lower = name.lower()
+        if not hasattr(cls, name_lower):
+            if name in cls.MULTIPLE_FIELDS:
+                prop = construct_multiple(name)
+            else:
+                prop = construct_one(name)
+            setattr(cls, reserved_names_map.get(name_lower, name_lower), prop)
+        
+    return cls
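+# For illustration, a hypothetical minimal subclass (not part of this
+# module):
+#
+#     @entry_decorate
+#     class MinimalEntry(DBEntry):
+#         FIELDS = [("ENTRY", fields.DBEntryField),
+#                   ("NAME", fields.DBNameField)]
+#
+# The decorator adds `entry` and `name` properties that return the parsed
+# fields converted through `DBField._convert`.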
+
+class DBEntry(object):
+    """ A DBGET entry object.
+    """
+    FIELDS = [("ENTRY", fields.DBEntryField),
+              ]
+    MULTIPLE_FIELDS = []
+    
+    def __init__(self, text=None):
+        self._sections = {}
+        if text is not None:
+            self.parse(text)
+            
+    @property
+    def entry_key(self):
+        """ Primary entry key used for querying.
+        """
+        return self.entry.split(" ", 1)[0]
+    
+    def parse(self, text):
+        parser = DBGETEntryParser()
+        gen = parser.parse_string(text)
+        entry_start = 0
+        field_constructors = dict(self.FIELDS)
+        sections = defaultdict(list)
+        current = None
+        current_subfield = None
+        entry_fields = []
+        for (event, title, text) in gen:
+            if event == DBGETEntryParser.SECTION_START:
+                if title in field_constructors:
+                    ftype = field_constructors[title]
+                else:
+                    ftype = fields.DBSimpleField
+                current = ftype(text)
+                if current.TITLE is None:
+                    current.TITLE = title
+            elif event == DBGETEntryParser.SECTION_END:
+                entry_fields.append(current)
+                current = None
+            elif event == DBGETEntryParser.SUBSECTION_START:
+                current_subfield = fields.DBSimpleField(text)
+                current_subfield.TITLE = title
+                if not isinstance(current, fields.DBFieldWithSubsections):
+                    # Upgrade simple fields to FieldWithSubsection
+                    new = fields.DBFieldWithSubsections(current.text)
+                    new.TITLE = current.TITLE
+                    current = new
+                    
+            elif event == DBGETEntryParser.SUBSECTION_END:
+                current.subsections.append(current_subfield)
+                current_subfield = None
+            elif event == DBGETEntryParser.TEXT:
+                if current_subfield is not None:
+                    current_subfield.text += text
+                elif current is not None:
+                    current.text += text
+            elif event == DBGETEntryParser.ENTRY_END:
+                break
+        
+        self.fields = entry_fields
+        self._consolidate()
+        
+    def _consolidate(self):
+        """ Update mapping to field entries.
+        """
+        registered_fields = dict(self.FIELDS)
+        multiple_fields = set(self.MULTIPLE_FIELDS)
+        
+        for field in self.fields:
+            title = field.TITLE
+            if title not in registered_fields:
+                import warnings
+                warnings.warn("Nonregisterd field %r in %r" % \
+                                (title, type(self)))
+            if title in multiple_fields:
+                if not hasattr(self, title):
+                    setattr(self, title, [])
+                getattr(self, title).append(field)
+            else:
+                setattr(self, title, field)
+        
+    
+    def __str__(self):
+        return self.format()
+    
+    def format(self, section_indent=12):
+        return "".join(f.format(section_indent)\
+                       for f in self.fields)
+        
+    def get(self, key, default=None):
+        """ Return the text of the field with title `key` (a list of
+        texts for fields listed in MULTIPLE_FIELDS), or `default` if
+        the field is not present.
+        """
+        field = getattr(self, key, None)
+        if field is None:
+            return default
+        if key in self.MULTIPLE_FIELDS:
+            return [f.text for f in field]
+        return field.text
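+# Parsing sketch, reusing the hypothetical MinimalEntry subclass from
+# above (the flat-file text is a shortened, made-up DBGET record):
+#
+#     >>> e = MinimalEntry("ENTRY       b0002  CDS  E.coli\n"
+#     ...                  "NAME        thrA\n"
+#     ...                  "///\n")
+#     >>> e.entry_key
+#     'b0002'
+#     >>> e.name
+#     'thrA'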

_bioinformatics/obiKEGG/entry/fields.py

+"""
+Wrapper classes for DBGET entry fields, providing a pythonic
+interface.
+  
+"""
+
+class DBField(object):
+    """ Base DBGET entry field
+    """
+    __SLOTS__ = ["text"]
+    def __init__(self, text):
+        self.text = text
+        
+    def _convert(self):
+        """ Convert the contents into python representation using
+        builtin types.
+         
+        """
+        return self.text.rstrip("\n")
+    
+    
+class DBSimpleField(DBField):
+    """ Simple field (with no subsections).
+    """
+    __SLOTS__ = ["text"]
+    # TITLE must be set in subclasses or object instances
+    TITLE = None
+    def __str__(self):
+        return self.format()
+        
+        
+    def format(self, section_indent=12, subsection_indent=0):
+        fmt = (" " * subsection_indent) + "%-" + \
+              str(section_indent - subsection_indent) + \
+              "s%s"
+        text = self._indent(self.text, section_indent)
+        text = fmt % (self.TITLE, text)
+        return text
+    
+    def _indent(self, text, section_indent=12):
+        indent_str = "\n" + " " * section_indent
+        nl_count = text.count("\n")
+        return text.replace("\n", indent_str, nl_count - 1)
+    
+    
+class DBEntryField(DBSimpleField):
+    """ ENTRY field (all entries start with this field)
+    """
+    __SLOTS__ = ["text"]
+    TITLE = "ENTRY"
+    
+    
+class DBNameField(DBSimpleField):
+    __SLOTS__ = ["text"]
+    TITLE = "NAME"
+    
+    
+class DBDefinitionField(DBSimpleField):
+    __SLOTS__ = ["text"]
+    TITLE = "DEFINITION"
+    
+    
+class DBFieldWithSubsections(DBSimpleField):
+    """ A field with subsections (for instance REFERENCE in genome)
+    """
+    __SLOTS__ = ["text", "subsections"]
+    TITLE = None
+    SUBSECTIONS = None
+    
+    def __init__(self, text, subsections=None):
+        self.text = text
+        self.subsections = subsections or []
+        
+    def format(self, section_indent=12, subsection_indent=2):
+        text = DBSimpleField.format(self, section_indent, subsection_indent=0)
+        subsections = [sub.format(section_indent, subsection_indent)\
+                       for sub in self.subsections]
+        return "".join([text] + subsections)