Commits

Aleš Erjavec committed 87dbac8

Removed uses of obiGenomicsUpdate from obiGO.

Moved update code to server_update/updateGO.py

  • Participants
  • Parent commits 75ebb26

Comments (0)

Files changed (2)

File orangecontrib/bio/obiGO.py

     return set(r)
 
 
-class __progressCallbackWrapper:
-    def __init__(self, callback):
-        self.callback = callback
-
-    def __call__(self, bCount, bSize, fSize):
-        fSize = 10000000 if fSize == -1 else fSize
-        self.callback(100 * bCount * bSize / fSize)
-
-from .obiGenomicsUpdate import Update as UpdateBase
-
-
-class Update(UpdateBase):
-    def __init__(self, local_database_path=None, progressCallback=None):
-        UpdateBase.__init__(self, local_database_path or getDataDir(), progressCallback)
-
-    def CheckModified(self, addr, date=None):
-        return date < self.GetLastModified(addr) if date else True
-
-    def CheckModifiedOrg(self, org):
-        return self.CheckModified("http://www.geneontology.org/gene-associations/gene_association." + org + ".gz", self.LastModifiedOrg(org))
-
-    def LastModifiedOrg(self, org):
-        return self.shelve.get((Update.UpdateAnnotation, (org,)), None)
-
-    def GetLastModified(self, addr):
-        stream = urllib2.urlopen(addr)
-        return datetime.strptime(stream.headers.get("Last-Modified"), "%a, %d %b %Y %H:%M:%S %Z")
-##        return stream.headers.get("Last-Modified")
-
-    def GetAvailableOrganisms(self):
-        source = urllib2.urlopen("http://www.geneontology.org/gene-associations/").read()
-        return [s.split(".")[1] for s in sorted(set(re.findall("gene_association\.[a-zA-z0-9_]+?\.gz", source)))]
-
-    def GetDownloadedOrganisms(self):
-        return [name.split(".")[1] for name in os.listdir(self.local_database_path) if name.startswith("gene_association")]
-
-    def IsUpdatable(self, func, args):
-        if func == Update.UpdateOntology:
-            return self.CheckModified("http://www.geneontology.org/ontology/gene_ontology.obo", self.shelve.get((Update.UpdateOntology, ()), None))
-        elif func == Update.UpdateAnnotation:
-            return self.CheckModifiedOrg(args[0])
-
-    def GetDownloadable(self):
-        orgs = set(self.GetAvailableOrganisms()) - set(self.GetDownloadedOrganisms())
-        ret = []
-        if (Update.UpdateOntology, ()) not in self.shelve:
-            ret.append((Update.UpdateOntology, ()))
-        if orgs:
-            ret.extend([(Update.UpdateAnnotation, (org,)) for org in orgs])
-        return ret
-
-    def UpdateOntology(self):
-        Ontology.DownloadOntology(os.path.join(self.local_database_path, "gene_ontology_edit.obo.tar.gz"), self.progressCallback)
-        self._update(Update.UpdateOntology, (), self.GetLastModified("http://www.geneontology.org/ontology/gene_ontology.obo"))
-
-    def UpdateAnnotation(self, org):
-        Annotations.DownloadAnnotations(org, os.path.join(self.local_database_path, "gene_association." + org + ".tar.gz"), self.progressCallback)
-        self._update(Update.UpdateAnnotation, (org,), self.GetLastModified("http://www.geneontology.org/gene-associations/gene_association." + org + ".gz"))
-
-    def UpdateTaxonomy(self, org):
-        exclude = ["goa_uniprot", "goa_pdb", "GeneDB_tsetse", "reactome", "goa_zebrafish", "goa_rat", "goa_mouse"]
-
-        orgs = self.GetAvailableOrganisms()
-        tax = defaultdict(set)
-
-        for org in orgs:
-            if org in exclude:
-                continue
-            try:
-                a = obiGO.Annotations(os.path.join(self.local_database_path, "gene_association." + org + ".tar.gz"))
-                taxons = set(ann.taxon for ann in a.annotations)
-                for taxId in [t.split(":")[-1] for t in taxons if "|" not in t]:  # exclude taxons with cardinality 2
-                    tax[taxId].add(org)
-            except Exception, ex:
-                print ex
-
-        cPickle.dump(dict(tax), open(os.path.join(path, "taxonomy.pickle"), "wb"))
-
-
 def _test1():
 ##    Ontology.DownloadOntology("ontology_arch.tar.gz")
 ##    Annotations.DownloadAnnotations("sgd", "annotations_arch.tar.gz")

File server_update/updateGO.py

 ##!interval=7
 ##!contact=ales.erjavec@fri.uni-lj.si
 
+import urllib2
+import re
+import cPickle
+import tarfile
+
+from datetime import datetime
+from collections import defaultdict
+
 from common import *
 
-from Orange.bio import obiGO, obiTaxonomy, obiGene, obiGenomicsUpdate
+from Orange.bio import obiGO, obiTaxonomy, obiGene
 
-import urllib2, tarfile
 
-from collections import defaultdict
-
-tmpDir = os.path.join(environ.buffer_dir, "tmp_GO")
+tmp_path = os.path.join(environ.buffer_dir, "tmp_GO")
 try:
-    os.mkdir(tmpDir)
+    os.makedirs(tmp_path)
 except Exception:
     pass
 
-u = obiGO.Update(local_database_path = tmpDir)
-
 uncompressedSize = lambda filename: sum(info.size for info in tarfile.open(filename).getmembers())
 
-def pp(*args, **kw): print args, kw
+DATE_FMT_1 = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FMT_2 = "%Y-%m-%d %H:%M:%S"
 
-if u.IsUpdatable(obiGO.Update.UpdateOntology, ()):
-    u.UpdateOntology()
-    filename = os.path.join(tmpDir, "gene_ontology_edit.obo.tar.gz")
+
+def info_date_time_parse(time):
+    """
+    Convert the "datetime" string of an sf info record to datetime.datetime.
+
+    The string is parsed with DATE_FMT_1 (which includes a fractional
+    seconds field) first. If that raises ValueError it is parsed with
+    DATE_FMT_2 (whole seconds only) instead; a ValueError raised by this
+    second attempt propagates to the caller.
+    """
+    try:
+        parsed = datetime.strptime(time, DATE_FMT_1)
+    except ValueError:
+        # First format did not match; retry without the "%f" field.
+        parsed = datetime.strptime(time, DATE_FMT_2)
+    return parsed
+
+
+def http_last_modified(url):
+    """
+    Retrieve a "last-modified" time for the url as a datetime.datetime object.
+
+    The url is opened with urllib2.urlopen and the value of the
+    "Last-Modified" response header is parsed with the format
+    "%a, %d %b %Y %H:%M:%S %Z".
+
+    The response object is always closed before the header is parsed.
+    If the server sends no "Last-Modified" header, the header lookup
+    yields None and datetime.strptime raises TypeError.
+    """
+    stream = urllib2.urlopen(url)
+    try:
+        last_modified = stream.headers.get("Last-Modified")
+    finally:
+        # Only the headers are needed; release the connection explicitly
+        # instead of leaving it open until the object is collected.
+        stream.close()
+    return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+
+
+def list_available_organisms():
+    """
+    Return a sorted list of all available GO organism codes.
+
+    The listing at http://www.geneontology.org/gene-associations/ is
+    downloaded and searched for names of the form
+    "gene_association.<code>.gz"; each distinct <code> is returned once.
+    """
+    stream = urllib2.urlopen("http://www.geneontology.org/gene-associations/")
+    try:
+        source = stream.read()
+    finally:
+        # Close the response explicitly so the connection is not leaked.
+        stream.close()
+    # Raw string so the escaped dots reach the regex engine unchanged, and
+    # "A-Z" rather than "A-z": the latter range also matched the
+    # characters "[", "\", "]", "^" and "`".
+    codes = re.findall(r"gene_association\.([a-zA-Z0-9_]+?)\.gz", source)
+    return sorted(set(codes))
+
+
+def sf_org_mtime(org_code):
+    """
+    Return the "datetime" of an organism's annotation file on the server.
+
+    Fetches the info record for "gene_association.<org_code>.tar.gz" under
+    "GO" from sf_server and parses its "datetime" field with
+    info_date_time_parse.
+    """
+    filename = "gene_association.{}.tar.gz".format(org_code)
+    record = sf_server.info("GO", filename)
+    return info_date_time_parse(record["datetime"])
+
+
+def web_org_mtime(org_code):
+    """
+    Return the "Last-Modified" time of an organism's annotation file on
+    www.geneontology.org, as reported by http_last_modified.
+    """
+    url_template = \
+        "http://www.geneontology.org/gene-associations/gene_association.{}.gz"
+    return http_last_modified(url_template.format(org_code))
+
+
+def sf_ontology_mtime():
+    """
+    Return the "datetime" of the ontology archive on the server.
+
+    Fetches the info record for "gene_ontology_edit.obo.tar.gz" under "GO"
+    from sf_server and parses its "datetime" field with
+    info_date_time_parse.
+    """
+    record = sf_server.info("GO", "gene_ontology_edit.obo.tar.gz")
+    timestamp = record["datetime"]
+    return info_date_time_parse(timestamp)
+
+
+def web_ontology_mtime():
+    """
+    Return the "Last-Modified" time of the gene_ontology.obo file on
+    www.geneontology.org, as reported by http_last_modified.
+    """
+    ontology_url = "http://www.geneontology.org/ontology/gene_ontology.obo"
+    return http_last_modified(ontology_url)
+
+
+if web_ontology_mtime() > sf_ontology_mtime():
+    filename = os.path.join(tmp_path, "gene_ontology_edit.obo.tar.gz")
+    obiGO.Ontology.DownloadOntology(filename)
+
     ##load the ontology to test it
     o = obiGO.Ontology(filename)
     del o
     ##upload the ontology
     print "Uploading gene_ontology_edit.obo.tar.gz"
-    sf_server.upload("GO", "gene_ontology_edit.obo.tar.gz", filename, title = "Gene Ontology (GO)",
-                       tags=["gene", "ontology", "GO", "essential", "#uncompressed:%i" % uncompressedSize(filename), "#version:%i" % obiGO.Ontology.version])
+    sf_server.upload(
+        "GO", "gene_ontology_edit.obo.tar.gz", filename,
+        title="Gene Ontology (GO)",
+        tags=["gene", "ontology", "GO", "essential",
+              "#uncompressed:%i" % uncompressedSize(filename),
+              "#version:%i" % obiGO.Ontology.version]
+    )
     sf_server.unprotect("GO", "gene_ontology_edit.obo.tar.gz")
 
-#from obiGeneMatch import _dbOrgMap
-#
-#exclude = ["goa_uniprot", "goa_pdb", "GeneDB_tsetse", "reactome", "goa_zebrafish", "goa_rat", "goa_mouse"]
-#lines = [line.split("\t") for line in urllib2.urlopen("ftp://ftp.genome.jp/pub/kegg/genes/taxonomy").readlines() if not line.startswith("#")]
-#keggOrgNames = dict([(line[1].strip(), line[-1][:-5].strip().replace("(", "").replace(")", "") if line[-1].endswith("(EST)\n") else line[-1].strip()) for line in lines if len(line)>1])
 
-#additionalNames = {"goa_arabidopsis":"Arabidopsis thaliana", "sgn":"Solanaceae", "PAMGO_Oomycetes":"Oomycete"}
-#essentialOrgs = ["goa_human", "sgd", "mgi", "dictyBase"]
+orgMap = {"352472": "44689", "562": "83333", "3055": None,
+          "7955": None, "11103": None, "2104": None, "4754":
+          None, "31033": None, "8355": None, "4577": None}
 
-orgMap = {"352472":"44689", "562":"83333", "3055":None, "7955":None, "11103":None, "2104":None, "4754":None, "31033":None, "8355":None, "4577":None}
+commonOrgs = dict([(obiGO.from_taxid(id), id)
+                   for id in obiTaxonomy.common_taxids()
+                   if obiGO.from_taxid(id) != None])
 
-#commonOrgs = dict([(obiGO.from_taxid(orgMap.get(id, id)).pop(), orgMap.get(id, id)) for id in obiTaxonomy.common_taxids() if orgMap.get(id, id) != None])
-commonOrgs = dict([(obiGO.from_taxid(id), id) for id in obiTaxonomy.common_taxids() if obiGO.from_taxid(id) != None])
+essentialOrgs = [obiGO.from_taxid(id)
+                 for id in obiTaxonomy.essential_taxids()]
 
-essentialOrgs = [obiGO.from_taxid(id) for id in obiTaxonomy.essential_taxids()]
-
-exclude = ["goa_uniprot", "goa_pdb", "GeneDB_tsetse", "reactome", "goa_zebrafish", "goa_rat", "goa_mouse"]
+exclude = ["goa_uniprot", "goa_pdb", "GeneDB_tsetse", "reactome",
+           "goa_zebrafish", "goa_rat", "goa_mouse"]
 
 updatedTaxonomy = defaultdict(set)
 
-for org in u.GetAvailableOrganisms():
+
+for org in list_available_organisms():
     if org in exclude or org not in commonOrgs:
         continue
 
-    if u.IsUpdatable(obiGO.Update.UpdateAnnotation, (org,)):
-        u.UpdateAnnotation(org)
-        filename = os.path.join(tmpDir, "gene_association." + org + ".tar.gz")
-        
-        ## Load the annotations to test them and collect all taxon ids from them
-        a = obiGO.Annotations(filename, genematcher=obiGene.GMDirect())
-        taxons = set([ann.taxon for ann in a.annotations])
-        for taxId in [t.split(":")[-1] for t in taxons if "|" not in t]: ## exclude taxons with cardinality 2
-            updatedTaxonomy[taxId].add(org)
-        del a
-        ## Upload the annotation
-#        if org in _dbOrgMap:
-#            orgName = keggOrgNames[_dbOrgMap[org]].split("(")[0].strip()
-#        elif org in additionalNames:
-#            orgName = additionalNames[org]
-#        else:
-#            orgName = org
-        orgName = obiTaxonomy.name(commonOrgs[org])
-        taxid = obiTaxonomy.taxname_to_taxid(orgName)
-#            print "unknown organism name translation for:", org
-        print "Uploading", "gene_association." + org + ".tar.gz"
-        sf_server.upload("GO", "gene_association." + org + ".tar.gz", filename, title = "GO Annotations for " + orgName,
-                           tags=["gene", "annotation", "ontology", "GO", orgName, "#uncompressed:%i" % uncompressedSize(filename),
-                                 "#organism:"+orgName, "#version:%i" % obiGO.Annotations.version] + (["essential"] if org in essentialOrgs else []) + obiTaxonomy.shortname(taxid))
-        sf_server.unprotect("GO", "gene_association." + org + ".tar.gz")
-        
+    if web_org_mtime(org) <= sf_org_mtime(org):
+        # Skip update
+        continue
+
+    filename = os.path.join(tmp_path, "gene_association." + org + ".tar.gz")
+    obiGO.Annotations.DownloadAnnotations(org, filename)
+
+    ## Load the annotations to test them and collect all taxon ids from them
+    a = obiGO.Annotations(filename, genematcher=obiGene.GMDirect())
+    taxons = set([ann.taxon for ann in a.annotations])
+    ## exclude taxons with cardinality 2
+    taxons = [tax for tax in taxons if "|" not in tax]
+    for tax in taxons:
+        taxid = tax.split(":", 1)[-1]
+        updatedTaxonomy[taxid].add(org)
+    del a
+
+    orgName = obiTaxonomy.name(commonOrgs[org])
+    taxid = obiTaxonomy.taxname_to_taxid(orgName)
+
+    print "Uploading", "gene_association." + org + ".tar.gz"
+    sf_server.upload(
+        "GO", "gene_association." + org + ".tar.gz", filename,
+        title="GO Annotations for " + orgName,
+        tags=["gene", "annotation", "ontology", "GO", orgName,
+              "#uncompressed:%i" % uncompressedSize(filename),
+              "#organism:" + orgName,
+              "#version:%i" % obiGO.Annotations.version] +
+             (["essential"] if org in essentialOrgs else []) +
+             obiTaxonomy.shortname(taxid)
+    )
+    sf_server.unprotect("GO", "gene_association." + org + ".tar.gz")
+
 try:
-    import cPickle
-#    tax = cPickle.load(open(os.path.join(tmpDir, "taxonomy.pickle"), "rb"))
     tax = cPickle.load(open(sf_local.localpath_download("GO", "taxonomy.pickle"), "rb"))
 except Exception:
     tax = {}
 
 ## Upload taxonomy if any differences in the updated taxonomy
-if any(tax.get(key, set()) != updatedTaxonomy.get(key, set()) for key in set(updatedTaxonomy)):
+if any(tax.get(key, set()) != updatedTaxonomy.get(key, set())
+       for key in set(updatedTaxonomy)):
     tax.update(updatedTaxonomy)
-    cPickle.dump(tax, open(os.path.join(tmpDir, "taxonomy.pickle"), "wb"))
+    cPickle.dump(tax, open(os.path.join(tmp_path, "taxonomy.pickle"), "wb"))
     print "Uploading", "taxonomy.pickle"
-    sf_server.upload("GO", "taxonomy.pickle", os.path.join(tmpDir, "taxonomy.pickle"), title="GO taxon IDs",
-                       tags = ["GO", "taxon", "organism", "essential", "#version:%i" % obiGO.Taxonomy.version])
+    sf_server.upload(
+        "GO", "taxonomy.pickle", os.path.join(tmp_path, "taxonomy.pickle"),
+        title="GO taxon IDs",
+        tags=["GO", "taxon", "organism", "essential",
+              "#version:%i" % obiGO.Taxonomy.version])
     sf_server.unprotect("GO", "taxonomy.pickle")