#!/usr/bin/env python
# Created by Dénes Türei 2021-08-19

#
# Denes Turei (turei.denes@gmail.com)
#

from pypath.core import network
from pypath.resources import network as netres
from pypath.share import common

# First we create a network object and load the "pathway" dataset:

n = network.Network()
n.load(netres.pathway)

# The Network object provides many methods to extract, filter and group
# information. The methods with `_by_` in the middle collect certain
# information and group it by another property. For example, to collect
# all interacting pairs grouped by literature references:

i_by_ref = n.interactions_by_reference()

# In this dictionary we find elements like this:

# {...
# <Reference: 16541104>: {
#     (<Entity: RAB8A>, <Entity: RABIF>),
#     (<Entity: RABIF>, <Entity: RAB8A>)
# },
# ...}

# meaning that the interactions between RAB8A and RABIF have been curated
# from the paper with PubMed ID 16541104. All these methods accept filters;
# for example, to get the same kind of data but only from the SPIKE and
# SIGNOR databases:

i_by_ref_s = n.interactions_by_reference(resources = {'SPIKE', 'SIGNOR'})

# The methods prefixed with `get_` collect some elements from the interaction
# objects, optionally with filtering. For example, to collect all molecule
# labels which are in the TRIP database:

genes_in_trip = n.get_labels(resources = 'TRIP')

# Similar methods can tell which organisms or interaction types are present
# in the network. This network contains only post-translational interactions
# from human (9606):

n.get_interaction_types()
# {'post_translational'}
n.get_organisms()
# {9606}

# The "nodes" dict contains pypath.core.entity.Entity objects with UniProt
# IDs as keys, the "nodes_by_label" dict is the same but Gene Symbols are
# the keys:
n.nodes['P00533']
# <Entity: EGFR>

n.nodes_by_label['EGFR']
# <Entity: EGFR>

# The "interactions" dict contains pypath.core.interaction.Interaction
# objects; the keys are tuples of node pairs. To access an element in this
# dict we can create Entity objects:

from pypath.core import entity
egfr = entity.Entity('EGFR')
egf = entity.Entity('EGF')

# and use them as keys; the pair is ordered alphabetically by identifier:

n.interactions[(egfr, egf)]
# <Interaction: EGFR <=(+)==(+)==> EGF [Evidences: AlzPathway,
# BEL-Large-Corpus, Baccin2019, BioGRID, CancerCellMap, CellChatDB,
# CellPhoneDB, CellTalkDB, DIP, DLRP, EMBRACE, Fantom5, Guide2Pharma, HPMR,
# HPRD, ICELLNET, IntAct, KEGG, KEGG-MEDICUS, Kirouac2010, LRdb, Lit-BM-17,
# NetPath, PhosphoPoint, ProtMapper, REACH, Ramilowski2015, SIGNOR, SPIKE,
# STRING, SignaLink3, Sparser, Wang, cellsignal.com, connectomeDB2020, iTALK,
# talklr (75 references)]>

# This is done automatically by the "interaction" method. Let's isolate this
# interaction to use it as an example:

i = n.interaction('EGF', 'EGFR')

# The Interaction object has more or less the same methods as the Network
# object, as the latter mostly works by calling these methods on all
# interactions and accumulating their results. Let's take a closer look
# at one of these. The `get_interactions` method returns tuples of entities,
# here two of them, because this is a mutual interaction:

i.get_interactions()
# ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))

# All these methods accept the usual filtering parameters, such as
# `direction`, `effect`, `resources`, `references`, `interaction_type`,
# `data_model`, etc. For example, we can limit our query to certain resources:

i.get_interactions(resources = {'SignaLink3', 'SIGNOR'})
# ((<Entity: EGF>, <Entity: EGFR>),)

# In this case only one tuple is returned, because in SIGNOR and SignaLink3
# only the EGF->EGFR interaction is present. Using a `by` method, we can
# retrieve the interacting pairs by resource:

i.interactions_by_resource()
# {
# 'AlzPathway': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'Baccin2019': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'BioGRID': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CancerCellMap': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CellChatDB': ((<Entity: EGF>, <Entity: EGFR>),),
# 'CellPhoneDB': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CellTalkDB': ((<Entity: EGF>, <Entity: EGFR>),),
# ...}

# As we see, CellChatDB and CellTalkDB contain only the EGF->EGFR
# interaction, while the rest of the resources also contain the EGFR->EGF.
# We can also apply filters in this query. For example, if we want only the
# negative interactions, an empty dict is returned, as no resource annotates
# this interaction as negative:

i.interactions_by_resource(effect = 'negative')
# {}

# If we query for positive interactions, we see that only 6 resources
# provide any information about the effect, and only "Wang" tells that the
# EGFR->EGF interaction is positive:

i.interactions_by_resource(effect = 'positive')
# {'KEGG': ((<Entity: EGF>, <Entity: EGFR>),),
# 'KEGG-MEDICUS': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SIGNOR': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SPIKE': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SignaLink3': ((<Entity: EGF>, <Entity: EGFR>),),
# 'Wang': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))}

# As another example, we can find out which resources a reference is coming
# from, and also which references a resource contains:

i.resource_names_by_reference()
i.references_by_resource()
# Same, but considering only the evidences that carry an effect sign:
i.references_by_resource(effect = True)

# These examples demonstrated the design of `get_` and `_by_` methods. These
# methods extract information from data structures held by the Interaction
# object, mostly within `pypath.core.evidence.Evidence(s)` objects. An
# Evidence object always points to a resource (which carries various
# metadata, such as license, URL, etc) and optionally a literature
# reference. Evidences can be accessed in bulk, or filtered and grouped:

i.evidences
# <Evidences: AlzPathway, BEL-Large-Corpus, Baccin2019, BioGRID,
# CancerCellMap, CellChatDB, CellPhoneDB, CellTalkDB, DIP, DLRP, EMBRACE,
# Fantom5, Guide2Pharma, HPMR, HPRD, ICELLNET, IntAct, KEGG, KEGG-MEDICUS,
# Kirouac2010, LRdb, Lit-BM-17, NetPath, PhosphoPoint, ProtMapper, REACH,
# Ramilowski2015, SIGNOR, SPIKE, STRING, SignaLink3, Sparser, Wang,
# cellsignal.com, connectomeDB2020, iTALK, talklr (75 references)>
i.get_evidences(effect = 'positive')
# <Evidences: KEGG, KEGG-MEDICUS, SIGNOR, SPIKE, SignaLink3,
# Wang (9 references)>
i.evidences_by_resource()
# {'AlzPathway': <Evidences: AlzPathway (1 references)>,
# 'Baccin2019': <Evidences: Baccin2019 (7 references)>,
# 'BioGRID': <Evidences: BioGRID (2 references)>,
# ...}

# The evidences are stored by direction:
i.direction
# {
#     'undirected': <Evidences: AlzPathway, BioGRID, CancerCellMap,
#         CellPhoneDB, DIP, HPRD, IntAct, Lit-BM-17, NetPath (24 references)>,
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: Baccin2019, PhosphoPoint,
#         Ramilowski2015, Wang (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: BEL-Large-Corpus,
#         Baccin2019, CellChatDB, CellTalkDB, DLRP, EMBRACE, Fantom5,
#         Guide2Pharma, HPMR, HPRD, ICELLNET, KEGG, KEGG-MEDICUS,
#         Kirouac2010, LRdb, ProtMapper, REACH, Ramilowski2015, SIGNOR,
#         SPIKE, STRING, SignaLink3, Sparser, Wang, cellsignal.com,
#         connectomeDB2020, iTALK, talklr (59 references)>
# }

# Similar dictionaries store the evidences for positive and negative effect
# signs:
i.positive
# {
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: Wang (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: KEGG, KEGG-MEDICUS, SIGNOR,
#         SPIKE, SignaLink3, Wang (9 references)>
# }
i.negative
# {
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: None (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: None (0 references)>
# }

# To extract information more efficiently, we can use the `which_` methods:

i.which_directions()
# ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))
i.which_signs()
# (
#     ((<Entity: EGFR>, <Entity: EGF>), 'positive'),
#     ((<Entity: EGF>, <Entity: EGFR>), 'positive')
# )

# As usual, you can filter by resources:
i.which_signs(resources = 'KEGG-MEDICUS')
# (((<Entity: EGF>, <Entity: EGFR>), 'positive'),)

# We see that most of the evidences support the EGF->EGFR interaction, and
# we might suspect that the EGFR->EGF interaction is wrong or maybe valid
# only in some special context or interpretation. We have a shortcut to get
# a majority consensus across all evidences:

i.majority_dir()
# (<Entity: EGF>, <Entity: EGFR>)
i.majority_sign()
# {
#     (<Entity: EGFR>, <Entity: EGF>): [False, False],
#     (<Entity: EGF>, <Entity: EGFR>): [True, False]
# }

# In the latter output, the `[True, False]` list represents positive and
# negative signs, i.e. only the EGF->EGFR positive sign is supported by
# the majority; all other signs and directions have a False value.

# Some resource-specific extra attributes are stored in the `attrs` dict:

i.attrs
# {'cellphonedb_type': 'ligand-ligand',
# 'dip_id': 'DIP-58935E',
# 'dip_methods': ['transmission electron microscopy',
#     'surface plasmon resonance', 'electron tomography', 'molecular sieving',
#     'affinity chromatography technology', 'biochemical', 'x ray scattering',
#     'anti bait coimmunoprecipitation'],
# 'dip_type': ['direct interaction', 'physical association'],
# 'hprd_methods': ['in vitro', 'in vivo'],
# 'intact_methods': {'anti bait coimmunoprecipitation',
#     'fluorescence-activated cell sorting', 'molecular sieving',
#     'proximity ligation assay', 'saturation binding',
#     'surface plasmon resonance', 'transmission electron microscopy',
#     'x ray scattering', 'x-ray crystallography'},
# 'mentha_score': 0.999,
# 'netpath_methods': ['in vitro', 'in vivo'],
# 'netpath_pathways': ['Epidermal growth factor receptor (EGFR)'],
# 'netpath_type': ['physical interaction'],
# 'phosphopoint_category': 'Category 1',
# 'ramilowski_sources': ['DLRP', 'HPMR', 'IUPHAR', 'HPRD', 'STRING.binding',
#     'STRING.experiment', 'literature supported'],
# 'signor_mechanism': ['binding'],
# 'spike_effect': '1',
# 'spike_mechanism': 'Physical Interaction'}

# To get a simple yes/no answer whether an interaction is directed and has
# an effect sign:

i.is_directed()
# True
i.has_sign()
# True
i.is_stimulation()
# True
i.is_inhibition()
# False
i.is_mutual()
# True

# The Network and the Interaction objects are able to yield data frame rows:
# named tuples which, when bound together, form a data frame.

i.generate_df_records()
# [
#     InteractionDataFrameRecord(
#         id_a='P00533', id_b='P01133', type_a='protein', type_b='protein',
#         directed=True, effect=1, type='post_translational',
#         dmodel={'activity_flow', 'ligand_receptor', 'interaction',
#         'enzyme_substrate'}, sources={'Wang'}, references=None),
#     ...
# ]

# The `get_df` method of the Network object returns the actual data frame:

df = n.get_df()

# Each interaction has two molecule partners called `a` and `b`, ordered
# alphabetically:

i.a
# <Entity: EGFR>
i.b
# <Entity: EGF>

# The two possible directions:
i.a_b
# (<Entity: EGFR>, <Entity: EGF>)
i.b_a
# (<Entity: EGF>, <Entity: EGFR>)

# The Entity objects carry some essential information:

i.a.identifier
# 'P00533'
i.a.id_type
# 'uniprot'
i.a.entity_type
# 'protein'
i.a.label
# 'EGFR'
i.a.taxon
# 9606

# The protein type Entity objects are able to print comprehensive
# information from UniProt as a table:

i.a.info()
#=====> [1 proteins] <=====
#╒═══════╤═══════╤═════════╤═════════╤═════════╤═════════╤═════════╤═════════╤═════════╕
#│   No. │ ac    │ genes   │   lengt │ weigh   │ full_   │ funct   │ keywo   │ subce   │
#│       │       │ ymbol   │       h │ t       │ name    │ ion_o   │ rds     │ llula   │
#│       │       │         │         │         │         │ r_gen   │         │ r_loc   │
#│       │       │         │         │         │         │ ecard   │         │ ation   │
#│       │       │         │         │         │         │ s       │         │         │
#╞═══════╪═══════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
#│     1 │ P0053 │ EGFR    │    1210 │ 13427   │ Epide   │ Recep   │ 3D-st   │ Cell    │
#│       │ 3     │         │         │ 7       │ rmal    │ tor t   │ ructu   │ membr   │
#│       │       │         │         │         │ growt   │ yrosi   │ re, A   │ ane;    │
#│       │       │         │         │         │ h fac   │ ne ki   │ ltern   │ Singl   │
#│       │       │         │         │         │ tor r   │ nase    │ ative   │ e-      │
#│       │       │         │         │         │ ecept   │ bindi   │ splic   │ pass    │
#│       │       │         │         │         │ or      │ ng li   │ ing,    │ type    │
#│       │       │         │         │         │         │ gands   │ ATP-b   │ I mem   │
#│       │       │         │         │         │         │ of      │ indin   │ brane   │
#│       │       │         │         │         │         │ the     │ g,      │ prote   │
#│       │       │         │         │         │         │ EGF f   │ Cell    │ in. E   │
#│       │       │         │         │         │         │ amily   │ membr   │ ndopl   │
#│       │       │         │         │         │         │ and a   │ ane,    │ asmic   │

# Now let's look into the Evidence objects. First we look at an Evidences
# object, which is a collection of many evidences:

evs = i.evidences

# The `match` method of evidences returns bool, while the `filter` method
# returns the matching Evidence objects. Is there any evidence in this
# collection with activity flow data model?

evs.match(data_model = 'activity_flow')
# True

# Get those evidences as a list:

list(evs.filter(data_model = 'activity_flow'))
# [<Evidence SPIKE (1 references)>, <Evidence SignaLink3 (5 references)>,
#  <Evidence Guide2Pharma (0 references)>, ...]

# The Network, the Interaction and the Evidences objects all have `count_`
# methods. For example, how many unique references are available in this
# evidence collection?

evs.count_references()
# 75

# We can also retrieve the set of those references:

evs.get_references()
# {<Reference: 24709886>, <Reference: 27564112>, <Reference: 15958209>, ...}

# The Reference objects carry the PubMed IDs:
r = common.first(evs.get_references())
r.pmid
# '25453753'

# They are able to fetch detailed information from PubMed:
r.info()
# {
#     '25453753': {
#         'elocationid': 'doi: 10.1016/j.celrep.2014.10.010',
#         'epubdate': '2014 Nov 6',
#         'essn': '2211-1247',
#         'fulljournalname': 'Cell reports',
#         'issue': '4',
#         'lang': ['eng'],
#         'lastauthor': 'Lemmon MA',
#         'pages': '1306-17',
#         'pmcrefcount': 29,
#         'pubdate': '2014 Nov 20',
#         'volume': '9'
#         ...
#     },
# }

# Or open the article in a browser by `r.open()`.

# The resources are available as Resource objects, or as simple strings.

evs.get_resources()
# {<NetworkResource: BEL-Large-Corpus (post_translational, enzyme_substrate)>,
#  <NetworkResource: HPMR (post_translational, ligand_receptor)>,
# ...}
evs.get_resource_names()
# {'AlzPathway', 'Baccin2019', 'BioGRID', 'CancerCellMap', 'CellChatDB', ...}

# Certain resources integrate data from other resources, so the data comes
# from the original resource via another resource. To list such indirectly
# obtained resources:

evs.get_resource_names(via = True)
# {'Fantom5', 'STRING', 'Guide2Pharma', 'HPRD', ...}

# Or only the ones obtained via ProtMapper:
evs.get_resource_names(via = 'ProtMapper')
# {'BEL-Large-Corpus', 'Sparser', 'REACH'}

# It is possible to list the resources as tuples, each having the original
# resource as first element and the mediating resource as second:
evs.get_resource_names_via(via = None)
# {('AlzPathway', None), ('BEL-Large-Corpus', 'ProtMapper'),
#  ('BioGRID', None), ('CancerCellMap', None), ('CellChatDB', None),
#  ('DLRP', 'talklr'), ('EMBRACE', None), ('Fantom5', 'LRdb'), ...}

# The individual evidences are stored in the `evidences` attribute. Let's
# take a closer look at one Evidence object:

ev = common.first(evs.evidences.values())
ev
# <Evidence SPIKE (1 references)>

ev.resource
# <NetworkResource: SPIKE (post_translational, activity_flow)>
ev.references
# {<Reference: 20458382>}

# The Network, Interaction and Evidence(s) objects support the `in`
# operator to check for the presence of resources, molecules or references,
# and Evidence objects are equal to the name of their resource:

'EGF' in n
# True
'SIGNOR' in i
# True
'EGFR' in i
# True
'20458382' in i # a PubMed ID
# True
'SPIKE' in evs
# True
'20458382' in evs # a PubMed ID
# True
ev == 'SPIKE'
# True

# Each Evidence object has a Resource object, now we take a closer look at
# this:

res = ev.resource

# Some essential attributes are the following:

res.name
# 'SPIKE'
res.data_type
# 'network'
res.interaction_type
# 'post_translational'
res.data_model
# 'activity_flow'
res.via
# None

# The `via` attribute carries the mediator resource name: pypath got the
# data from this resource, but the original resource is supposed to be the
# one in the `name` attribute.

# How the resource has been loaded is defined in a NetworkInput object.
# For example, to find the function or file which provided the raw data:

res.networkinput.input
# 'spike.spike_interactions'

# And to find out in this raw data which column contained the references:

res.networkinput.refs
# (5, ';')

# Or what was the identifier type in the raw data:

res.networkinput.id_type_a
# 'genesymbol'

    
# Snippet: Introduction to the molecular network API of pypath
# (Dénes Türei)