Commits

Aleš Erjavec committed ce65c16

Added PPIDatabase like interface for GeneMania.

  • Participants
  • Parent commits 8a0169e

Comments (0)

Files changed (1)

File obiGeneMania.py

     """
     return Connection().retrieve(org, genes, m, r)
 
+
+"""
+======================
+PPI Database interface
+======================
+
+"""
+
+import obiPPI
+import orngServerFiles
+import sqlite3
+import csv
+import os
+import posixpath
+
+from contextlib import contextmanager
+import StringIO
+
+@contextmanager
+def finishing(obj):
+    """ Calls obj.finish() on context exit.
+    """
+    yield obj
+    obj.finish()
+
+def guess_size(fileobj):
+    try:
+        if isinstance(fileobj, file):
+            return os.fstat(fileobj.fileno()).st_size
+        elif isinstance(fileobj, StringIO.StringIO):
+            pos = fileobj.tell()
+            fileobj.seek(0, 2)
+            length = fileobj.tell() - pos
+            fileobj.seek(pos, 0)
+            return length
+        elif isinstance(fileobj, urllib.addinfourl):
+            length = fileobj.headers.get("content-length", None)
+            return length
+    except Exception, ex:
+        pass
+
+
+def copyfileobj(src, dst, buffer=2**10, content_len=None, progress=None):
+    count = 0
+    if content_len is None:
+        content_len = guess_size(src) or sys.maxint
+    while True:
+        data = src.read(buffer)
+        dst.write(data)
+        count += len(data)
+        if progress:
+            progress(100.0 * count / content_len)
+        if not data:
+            break
+            
+            
+def wget(url, directory=".", dst_obj=None, progress=None):
+    """
+    .. todo:: Move too Orange.misc
+    """
+    stream = urllib2.urlopen(url)
+    length = stream.headers.get("content-length", None)
+    if length is None:
+        length = sys.maxint
+    else:
+        length = int(length)
+    
+    basename = posixpath.basename(url)
+        
+    if dst_obj is None:
+        dst_obj = open(os.path.join(directory, basename), "wb")
+    
+    if progress == True:
+        from Orange.misc import ConsoleProgressBar
+        progress = ConsoleProgressBar("Downloading %r." % basename)
+        with finishing(progress):
+            copyfileobj(stream, dst_obj, buffer=2**10, content_len=length,
+                        progress=progress)
+    else:
+        copyfileobj(stream, dst_obj, buffer=2**10, content_len=length,
+                    progress=progress)
+    
+import obiTaxonomy
+from collections import namedtuple
+from operator import itemgetter
+from Orange.misc import lru_cache
+
+GENE_MANIA_INTERACTION_FIELDS = \
+    ["gene_a", "gene_b", "weight", "network_name",
+     "network_group", "source", "pubmed_id"]
+     
+GeneManiaInteraction = namedtuple("GeneManiaInteraction",
+                                  field_names=GENE_MANIA_INTERACTION_FIELDS
+                                 )
+
+import weakref
+class Internalizer(object):
+    """ A class that acts as the python builtin function ``intern``,
+    for as long as it is alive.
+    
+    .. note:: This is for memory optimization only, it does not affect 
+        dict lookup speed.
+    
+    """
+    def __init__(self):
+        self._intern_dict = {}
+        
+    def __call__(self, obj):
+        return self._intern_dict.setdefault(obj, obj)
+    
+class GeneManiaDatabase(obiPPI.PPIDatabase):
+    DOMAIN = "PPI"
+    SERVER_FILE = "gene-mania-{taxid}.sqlite"
+    
+    TAXID2NAME = ""
+    
+    # DB schema
+    SCHEMA = """
+    table: `genes`
+        - `internal_id`: int (pk)
+        - `gene_name`: text (preferred name)
+        
+    table: `synonyms`:
+        - `internal_id: int (foreign key `genes.internal_id`)
+        - `synonym`: text
+        - `source_id`: int
+        
+    table: `source`:
+        - `source_id`: int
+        - `source_name`: text
+        
+    table: `links`:
+        - `gene_a`: int (fk `genes.internal_key`)
+        - `gene_b`: int (fk `genes.internal_key`)
+        - `network_id`: (fk `networks.network_id`)
+        - `weight`: real
+        
+    table: `networks`:
+        - `network_id`: int
+        - `network_name`: text
+        - `network_group`: text
+        - `source`: text
+        - `pubmed_id`: text
+        
+    view: `links_annotated`:
+        - `gene_name_a`
+        - `gene_name_b`
+        - `network_name`
+        - `network_group`
+        - `weight`
+        
+    """
+    
+    def __init__(self, taxid):
+        self.taxid = taxid
+        
+    @classmethod
+    def common_taxids(self):
+        return ["3702", "6239", "7227", "9606", "10090", "10116", "4932"]
+    
+    def organisms(self):
+        """ Return all organism taxids contained in this database.
+        
+        .. note:: a single taxid is returned (the one used at
+            instance initialization)   
+        
+        """
+        return [self.taxid]
+    
+    def ids(self, taxid=None):
+        """ Return all primary ids for `taxid`.
+        """
+        if taxid is None:
+            taxids = self.organisms()
+            return reduce(list.__add__, map(self.ids, taxids), [])
+        
+        con = self._db(taxid)
+        cur = con.execute("""\
+            SELECT gene_name FROM genes
+            """)
+        return map(itemgetter(0), cur)
+        
+    def synonyms(self, id):
+        """ Return a list of synonyms for primary `id`.
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("""\
+            SELECT synonyms.synonym
+            FROM synonyms NATURAL LEFT JOIN genes
+            WHERE genes.gene_name=?
+            """, (id,))
+        return map(itemgetter(0), cur)
+        
+    def all_edges(self, taxid=None):
+        """ Return a list of all edges.
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("""
+            SELECT links.gene_a, links.gene_b, links.weight
+            FROM links""")
+        id_to_name = self._gene_id_to_name()
+        return [(id_to_name[r[0]], id_to_name[r[1]], r[2]) \
+                for r in cur]
+        
+    def all_edges_annotated(self, taxid=None):
+        """ Return a list of all edges with all available annotations
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("""\
+            SELECT links.gene_a, links.gene_b, links.weight, links.network_id
+            FROM links""")
+        gene_to_name = self._gene_id_to_name()
+        network_to_description = self._network_id_to_description()
+        res = []
+        for gene_a, gene_b, w, n_id in cur:
+            n_desc = network_to_description[n_id]
+            
+            res.append(GeneManiaInteraction(gene_to_name[gene_a],
+                            gene_to_name[gene_b], w, *n_desc))
+        return res
+        
+    def edges(self, id1):
+        """ Return all edges for primary id `id1`.
+        """        
+        con = self._db(self.taxid)
+        cur = con.execute("""\
+            SELECT genes1.gene_name, genes2.gene_name, links.weight
+            FROM genes AS genes1  
+                JOIN links
+                    ON genes1.internal_id=links.gene_a
+                JOIN genes AS genes2
+                    ON genes2.internal_id=links.gene_b
+            WHERE genes1.gene_name=?
+            """, (id1,))
+        res = cur.fetchall()
+        cur = con.execute("""\
+            SELECT genes1.gene_name, genes2.gene_name, links.weight
+            FROM genes AS genes1  
+                JOIN  links
+                    ON genes1.internal_id=links.gene_a
+                JOIN genes AS genes2
+                    ON genes2.internal_id=links.gene_b
+            WHERE genes2.gene_name=?
+            """, (id1,))
+        res += cur.fetchall()
+        
+        return res
+    
+    def edges_annotated(self, id=None):
+        """ Return a list of annotated edges for primary `id` 
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("""\
+            SELECT genes1.gene_name, genes2.gene_name, links.weight,
+                   networks.network_name, networks.network_group,
+                   networks.source, networks.pubmed_id
+            FROM genes AS genes1
+                JOIN  links
+                    ON genes1.internal_id=links.gene_a
+                JOIN genes AS genes2
+                    ON genes2.internal_id=links.gene_b
+                NATURAL JOIN networks
+            WHERE genes1.gene_name=?
+            """, (id,))
+        res = cur.fetchall()
+        cur = con.execute("""\
+            SELECT genes1.gene_name, genes2.gene_name, links.weight,
+                   networks.network_name, networks.network_group,
+                   networks.source, networks.pubmed_id
+            FROM genes AS genes1
+                JOIN links
+                    ON genes1.internal_id=links.gene_a
+                JOIN genes AS genes2
+                    ON genes2.internal_id=links.gene_b
+                NATURAL JOIN networks
+            WHERE genes2.gene_name=?
+            """, (id,))
+        res += cur.fetchall()
+        return [GeneManiaInteraction(*r) for r in res]
+    
+    def search_id(self, name, taxid=None):
+        """ Search the database for gene name. Return a list of matching 
+        primary ids. Use `taxid` to limit the results to a single organism.
+        
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("""\
+            SELECT genes.gene_name
+            FROM genes NATURAL JOIN synonyms
+            WHERE synonyms.synonym=? 
+            """, (name,))
+        return map(itemgetter(0), cur)
+        
+    def _db(self, taxid=None):
+        """ Return an open sqlite3.Connection object.  
+        """
+        taxid = taxid or self.taxid
+        filename = orngServerFiles.localpath_download("PPI",
+                            self.SERVER_FILE.format(taxid=taxid))
+        if not os.path.exists(filename):
+            raise ValueError("Database is missing.")
+        
+        return sqlite3.connect(filename)
+    
+    @lru_cache(maxsize=1)
+    def _gene_id_to_name(self):
+        """ Return a dictionary mapping internal gene ids to 
+        primary gene identifiers.
+        
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("SELECT * FROM genes")
+        return dict(cur)
+    
+    @lru_cache(maxsize=1)
+    def _network_id_to_description(self):
+        """ Return a dictionary mapping internal network ids
+        to (name, group, source, pubmed id).
+         
+        """
+        con = self._db(self.taxid)
+        cur = con.execute("SELECT * FROM networks")
+        return dict((t[0], t[1:]) for t in cur)
+    
+    #####################################
+    # Data download and DB initialization
+    #####################################
+     
+    @classmethod
+    def download_data(cls, taxid=None, progress_callback=None):
+        """ Download the data for ``taxid`` from the GeneMANIA
+        website and initialize the local database.
+        
+        """
+        import tarfile
+        
+        baseurl = "http://genemania.org/data/current/"
+        directory = orngServerFiles.localpath("PPI")
+        if taxid is None:
+            taxid = cls.common_taxids()
+        
+        if isinstance(taxid, (list, tuple)):
+            taxids = taxid
+        else:
+            taxids = [taxid]
+        for taxid in taxids:
+            name = obiTaxonomy.name(taxid)
+            name = name.replace(" ", "_")
+            
+            if progress_callback is None:
+                progress = True #orngServerFiles.ConsoleProgressBar("Downloading %r." % filename)
+            else:
+                progress = progress_callback
+            
+            filename = name + ".tgz"
+            url = baseurl + "networks/" + filename    
+            wget(url, directory=directory, progress=progress)
+            
+            tgz_filename = os.path.join(directory, filename)    
+            tgz = tarfile.open(tgz_filename)
+            tgz.extractall(directory)
+            
+            filename = name + ".COMBINED.tgz"
+            url = baseurl + "precombined/" + filename
+            wget(url, directory=directory, progress=progress)
+            
+            tgz_filename = os.path.join(directory, filename)
+            tgz = tarfile.open(tgz_filename)
+            tgz.extractall(directory)
+        
+            cls.init_db([taxid])
+        
+    @classmethod
+    def init_db(cls, taxid=None):
+        """ Init the local data base.
+        """
+        from functools import partial
+        directory = orngServerFiles.localpath("PPI")
+        pjoin = partial(os.path.join, directory)
+        if taxid is None:
+            taxid = cls.common_taxids()
+            
+        if isinstance(taxid, (list, tuple)):
+            for tid in taxid:
+                cls.init_db(tid)
+            return
+                
+        if not isinstance(taxid, basestring):
+            raise ValueError("wrong taxid")
+            
+#        taxid = taxids
+        name = obiTaxonomy.name(taxid).replace(" ", "_")
+        networks = csv.reader(open(pjoin(name, "networks.txt")), delimiter="\t")
+        networks.next() # Header
+        networks = list(networks)
+        
+        database = pjoin(cls.SERVER_FILE.format(taxid=taxid))
+        with sqlite3.connect(database) as con:
+            con.execute("""DROP TABLE IF EXISTS genes""")
+            con.execute("""DROP TABLE IF EXISTS synonyms""")
+            con.execute("""DROP TABLE IF EXISTS source""")
+            con.execute("""DROP TABLE IF EXISTS links""")
+            con.execute("""DROP TABLE IF EXISTS networks""")
+            
+            con.execute("""DROP INDEX IF EXISTS genes_index""")
+            con.execute("""DROP INDEX IF EXISTS links_index_a""")
+            con.execute("""DROP INDEX IF EXISTS links_index_b""")
+            
+            con.execute("""\
+                CREATE TABLE networks 
+                    (network_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
+                     network_name TEXT,
+                     network_group TEXT,
+                     source TEXT,
+                     pubmed_id TEXT
+                    )""")
+            
+            con.executemany("""\
+                INSERT INTO networks
+                VALUES (?, ?, ?, ?, ?)""", [(i, r[2], r[1], r[3], r[4]) \
+                                        for i, r in enumerate(networks)])
+            
+            con.execute("""\
+                CREATE TABLE genes
+                    (internal_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
+                     gene_name TEXT
+                    )""")
+            
+            identifiers = csv.reader(open(pjoin(name, "identifier_mappings.txt"), "rb"),
+                                    delimiter="\t")
+            identifiers.next() # skip header
+            identifiers = list(identifiers)
+            genes = sorted(set(r[0] for r in identifiers))
+            sources = sorted(set(r[2] for r in identifiers))
+            
+            con.executemany("""\
+                INSERT INTO genes
+                VALUES (?, ?)""", enumerate(genes))
+            
+            con.execute("""\
+            CREATE TABLE source
+                (source_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
+                 source_name TEXT
+                )""")
+            
+            con.executemany("""\
+                INSERT INTO source
+                VALUES (?, ?)""", enumerate(sources))
+            
+            con.execute("""\
+                CREATE TABLE synonyms
+                    (internal_id INTEGER REFERENCES genes (internal_id),
+                     synonym TEXT,
+                     source_id INT REFERENCES source (source_id)
+                    )""")
+            
+            gene_to_id = dict((g, i) for i, g in enumerate(genes))
+            source_to_id = dict((s, i) for i, s in enumerate(sources))
+            con.executemany("""\
+                INSERT INTO synonyms
+                VALUES (?, ?, ?)""", [(gene_to_id[r[0]], r[1], source_to_id[r[2]])\
+                                       for r in identifiers])
+            
+            con.execute("""\
+                CREATE TABLE links
+                    (gene_a INTEGER REFERENCES genes (internal_id),
+                     gene_b INTEGER REFERENCES genes (internal_id),
+                     network_id INTEGER REFERENCES networks (network_id),
+                     weight REAL
+                     -- , PRIMARY KEY (gene_a, gene_b, network_id)
+                    )""")
+            
+            for i, (filename, group, _, _, _) in enumerate(networks):
+                nf  = open(pjoin(name, filename), "rb")
+                interactions = csv.reader(nf, delimiter="\t")
+                interactions.next() # skip header
+                con.executemany("""\
+                    INSERT INTO links 
+                    VALUES (?, ?, ?, ?)""",
+                    [(gene_to_id[r[0]], gene_to_id[r[1]], i, float(r[2])) \
+                     for r in interactions]
+                )
+                
+            # Add special combined network entry
+            combined_id = len(networks)
+            con.execute("""\
+                INSERT INTO networks
+                VALUES (?, ?, ?, ?, ?)""", 
+                (combined_id, "BP_COMBINING", "COMBINED", "GeneMANIA", ""))
+            
+            # Add the combined network links.
+            combined = open(pjoin(name + ".COMBINED", "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"), "rb")
+            combined = csv.reader(combined, delimiter="\t")
+            combined.next()
+            con.executemany("""\
+                INSERT INTO links
+                VALUES (?, ?, ?, ?)""",
+                    ((gene_to_id[r[0]], gene_to_id[r[1]], combined_id, float(r[2])) \
+                     for r in combined))
+            
+            
+            con.execute("""
+                CREATE VIEW IF NOT EXISTS links_annotated
+                AS SELECT genes1.gene_name AS gene_name_a,
+                          genes2.gene_name AS gene_name_b,
+                          links.weight,
+                          networks.network_name,
+                          networks.network_group,
+                          networks.source,
+                          networks.pubmed_id
+                   FROM  genes AS genes1
+                        JOIN links 
+                              ON genes1.internal_id=links.gene_a
+                        JOIN genes AS genes2
+                              ON links.gene_b=genes2.internal_id
+                        JOIN networks
+                              ON links.network_id=networks.network_id
+                    """)
+            
+            
+            con.execute("""\
+                CREATE INDEX IF NOT EXISTS genes_index ON genes (gene_name)
+                """)
+            con.execute("""\
+                CREATE INDEX IF NOT EXISTS links_index_a ON links (gene_a)
+                """)
+            con.execute("""\
+                 CREATE INDEX IF NOT EXISTS links_index_b ON links (gene_b)
+                """)
+        
+            
 if __name__ == "__main__":
     retrieve("9606", [ 'MRE11A', 'RAD51', 'MLH1', 'MSH2', 'DMC1', 'RAD51AP1', 'RAD50', 'MSH6', 'XRCC3', 'PCNA', 'XRCC2' ])