Snippets

Dénes Türei enriched pathways in neighborhood of drug targets

Created by Dénes Türei last modified
#!/usr/bin/env python

# Dénes Türei EMBL 2017
# turei.denes@gmail.com

import itertools
import collections
import fisher

import pypath

infile = 'lukas_drugs_targets.csv'
outfile = 'lukas_drugs_pathways.csv'
fisher_max_pval = .2

def read_data(infile):
    
    with open(infile, 'r') as fp:
        
        data = [[c.strip() for c in l.split(';')] for l in fp.read().split('\n')[1:]]
    
    for l in data:
        
        # removing excess commas and spaces
        # converting gene names to lists
        l[1] = list(map(lambda n: n.strip(), l[1].replace(',', '').split('/')))
    
    return data

def genesymbol_startswith(search, mapper):
    
    # collecting all gene names like `MYO...`
    u = set([])
    
    for gs, us in mapper.tables[9606][('genesymbol', 'swissprot')].mapping['to'].items():
        
        if gs.startswith(search):
            u.update(set(us))
    
    return u

def map_names(data, mapper):
    
    # non standard names:
    othernames = {
        'c-Met': 'MET',
        'PHD1':  'EGLN2',
        'c-Kit': 'KIT',
        'ACK':   'TNK1',
        'Tie2':  'TEK'
    }
    dnapol = ['P28340', 'P49005', 'Q15054', 'Q9HCU8']
    
    for l in data:
        
        # append a list to contain UniProtACs
        l.append([])
        
        for gs in l[1]:
            
            # 3 special cases
            if gs == 'DNA Polymerase':
                l[-1].extend(dnapol)
                continue
            
            if gs == 'MYO':
                l[-1].extend(genesymbol_startswith('MYO', mapper))
                continue
            
            if gs in othernames:
                gs = othernames[gs]
            
            # translate standard gene names and add uniprots to the list
            u = mapper.map_name(gs, 'genesymbol', 'uniprot')
            l[-1].extend(u)


def targets_lookup(data, pa):
    
    for l in data:
        
        # creating lists to contain the targets vertex ids
        l.append([])
        
        for t in l[5]:
            
            v = pa.uniprot(t)
            
            # if the lookup successful add the id
            if v: l[6].append(v.index)


def pathway_memberships(data, pa):
    
    for l in data:
        
        # using pathways from these 2 databases
        for db in ['kegg', 'signor']:
            
            # this set is to contain the pathways of the targets
            l.append(set([]))
            
            for vid in l[6]:
                
                # for each target node add the pathways to the set
                l[-1].update(pa.graph.vs['%s_pathways' % db][vid])


def score_pathways(data, pa):
    
    for db in ['kegg', 'signor']:
        
        # this is the igraph vertex attribute name
        pws = '%s_pathways' % db
        
        # now iterating data with a progress bar
        for l in pypath.progress.tqdm.tqdm(
            data, 'Calculating enrichment in %s' % db):
            
            # to store the results of the tests
            tests = []
            
            # how many targets this compound has
            numof_targets = len(l[6])
            # vertex ids of the neighbors
            targets_neighbors = set(itertools.chain(
                *pa.graph.neighborhood(l[6])
            ))
            # and the targets themselves
            targets_neighbors.update(set(l[6]))
            # count the occurances of pathways in the neighborhood
            pathways_in_neighborhood = collections.Counter(
                itertools.chain(
                    *[list(pa.graph.vs[vid][pws]) for vid in targets_neighbors]
                )
            )
            
            for pw, n_neighb in pathways_in_neighborhood.items():
                
                # vertex ids of all members of this pathway
                pw_memb = set(pa.pathway_members(pw, db).ids())
                
                # a contingency table
                contingency = [
                    n_neighb, # members of pathway in neighborhood
                    len(targets_neighbors) - n_neighb, # neighborhood
                                                       # not part of this
                                                       # pathway
                    len(pw_memb) - n_neighb, # pathway members not in the
                                             # neighborhood
                    (pa.graph.vcount() - len(pw_memb) - # all the remaining
                     len(targets_neighbors) + n_neighb) # proteins in the
                                                        # network
                ]
                
                # do fisher test
                pval = fisher.cfisher.pvalue(*contingency).right_tail
                
                # if p > 20% we can drop it
                if pval < fisher_max_pval:
                    tests.append((pw, pval))
            
            # sorting by p-value
            tests.sort(key = lambda i: i[1])
            
            # keeping the top 10
            l.append(tests[:10])


def export(data, outfile):
    
    # header of the output
    hdr = [
        'compound_name',
        'targets_genesymbols',
        'compound_chembl',
        'compound_kegg',
        'compound_effect',
        'targets_uniprots',
        'targets_pathways_kegg',
        'targets_pathways_signor',
        ''
    ]
    
    hdr.extend(['kegg'] * 20)
    hdr.append('')
    hdr.extend(['signor'] * 20)
    
    with open(outfile, 'w') as fp:
        
        # writing header
        fp.write('%s\n' % '\t'.join(hdr))
        # writing the table
        fp.write('\n'.join(
            '\t'.join([
                # the fields from input
                l[0], ';'.join(l[1]), '\t'.join(l[2:5]),
                # uniprot, kegg pathways and signor pathways
                ';'.join(l[5]), ';'.join(l[7]), ';'.join(l[8]), '',
                # fisher tests for top 10-10 pathways
                '\t'.join(
                    '%s\t%e' % l[9][i] if i < len(l[9]) else '\t'
                    for i in range(10)
                ), '',
                '\t'.join(
                    '%s\t%e' % l[10][i] if i < len(l[10]) else '\t'
                    for i in range(10)
                )
            ]) for l in data
        ))


def main(infile, outfile, pa = None):
    
    # initialize PyPath object
    if not pa:
        pa = pypath.PyPath()
        # this takes quite some time first because downloads
        # but also later when you load from cache it takes minutes
        pa.load_omnipath(kinase_substrate_extra = True)
        # this to load pathway annotations
        pa.load_all_pathways()
    
    # all the steps above in sequence
    # note: as data is a list of lists
    # we don't need to return it
    # as in Python lists contain references
    data = read_data(infile)
    map_names(data, pa.mapper)
    targets_lookup(data, pa)
    pathway_memberships(data, pa)
    score_pathways(data, pa)
    export(data, outfile)
    
    # this is good for keeping the pypath object
    # and the data for further manipulation
    return data, pa


if __name__ == '__main__':
    
    data, pa = main(infile, outfile)

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.