Snippets

Dénes Türei Introduction to the molecular network API of pypath

Created by Dénes Türei
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
#!/usr/bin/env python

#
# Denes Turei (turei.denes@gmail.com)
#

from pypath.core import network
from pypath.resources import network as netres
from pypath.share import common

# First we create a network object and load the "pathway" dataset:

n = network.Network()
n.load(netres.pathway)

# The Network object provides many methods to extract, filter and group
# information. The methods with `_by_` in the middle collect certain
# information and group them by another property. For example, to collect
# all interacting pairs grouped by literature references:

i_by_ref = n.interactions_by_reference()

# In this dictionary we find elements like this:

# {...
# <Reference: 16541104>: {
#     (<Entity: RAB8A>, <Entity: RABIF>),
#     (<Entity: RABIF>, <Entity: RAB8A>)
# },
# ...}

# meaning that the interactions between RAB8A and RABIF have been curated
# from the paper with PubMed ID 16541104. All these methods are able to
# filter, for example, to get the same kind of data but only from the
# SPIKE and SIGNOR databases:

i_by_ref_s = n.interactions_by_reference(resources = {'SPIKE', 'SIGNOR'})

# The methods prefixed with `get_` collect some elements from the interaction
# objects, optionally with filtering. For example, to collect all molecule
# labels which are in the TRIP database:

genes_in_trip = n.get_labels(resources = 'TRIP')

# Similar methods can answer which organisms or interaction types are
# present in the network. This network contains only post-translational
# interactions from human (9606):

n.get_interaction_types()
# {'post_translational'}
n.get_organisms()
# {9606}

# The "nodes" dict contains pypath.core.entity.Entity objects with UniProt
# IDs as keys, the "nodes_by_label" dict is the same but Gene Symbols are
# the keys:
n.nodes['P00533']
# <Entity: EGFR>

n.nodes_by_label['EGFR']
#<Entity: EGFR>

# The "interactions" dict contains pypath.core.interaction.Interaction
# objects; the keys are tuples of node pairs. To access an element in this
# dict we can create Entity objects:

from pypath.core import entity
egfr = entity.Entity('EGFR')
egf = entity.Entity('EGF')

# and use them as keys; their order is alphabetic by the identifiers:

n.interactions[(egfr, egf)]
# <Interaction: EGFR <=(+)==(+)==> EGF [Evidences: AlzPathway,
# BEL-Large-Corpus, Baccin2019, BioGRID, CancerCellMap, CellChatDB,
# CellPhoneDB, CellTalkDB, DIP, DLRP, EMBRACE, Fantom5, Guide2Pharma, HPMR,
# HPRD, ICELLNET, IntAct, KEGG, KEGG-MEDICUS, Kirouac2010, LRdb, Lit-BM-17,
# NetPath, PhosphoPoint, ProtMapper, REACH, Ramilowski2015, SIGNOR, SPIKE,
# STRING, SignaLink3, Sparser, Wang, cellsignal.com, connectomeDB2020, iTALK,
# talklr (75 references)]>

# This is done automatically by the "interaction" method, let's isolate this
# interaction to use it as an example:

i = n.interaction('EGF', 'EGFR')

# The Interaction object has more or less the same methods as the Network
# object, as the latter mostly works by calling these methods on all
# interactions and accumulating their results. Let's take a closer look
# at one of these. The `get_interactions` method returns tuples of entities,
# here two of them, because this is a mutual interaction:

i.get_interactions()
# ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))

# All these methods accept the usual filtering parameters, such as
# `direction`, `effect`, `resources`, `references`, `interaction_type`,
# `data_model`, etc. For example, we can limit our query to certain resources:

i.get_interactions(resources = {'SignaLink3', 'SIGNOR'})
# ((<Entity: EGF>, <Entity: EGFR>),)

# In this case only one tuple is returned, because in SIGNOR and SignaLink3
# only the EGF->EGFR interaction is present. Using a `by` method, we can
# retrieve the interacting pairs by resource:

i.interactions_by_resource()
# {
# 'AlzPathway': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'Baccin2019': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'BioGRID': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CancerCellMap': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CellChatDB': ((<Entity: EGF>, <Entity: EGFR>),),
# 'CellPhoneDB': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>)),
# 'CellTalkDB': ((<Entity: EGF>, <Entity: EGFR>),),
# ...}

# As we see, CellChatDB and CellTalkDB contain only the EGF->EGFR
# interaction, while the rest of the resources also contain EGFR->EGF.
# We can also apply filters in this query. For example, if we ask only for
# negative interactions, an empty dict is returned, as no resource annotates
# this interaction as negative:

i.interactions_by_resource(effect = 'negative')
# {}

# If we query for positive interactions, we see that only 6 resources
# provide any information about the effect, and only "Wang" tells that the
# EGFR->EGF interaction is positive:

i.interactions_by_resource(effect = 'positive')
# {'KEGG': ((<Entity: EGF>, <Entity: EGFR>),),
# 'KEGG-MEDICUS': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SIGNOR': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SPIKE': ((<Entity: EGF>, <Entity: EGFR>),),
# 'SignaLink3': ((<Entity: EGF>, <Entity: EGFR>),),
# 'Wang': ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))}

# As another example, we can find out which resources a reference is coming
# from, and also which references a resource contains:

i.resource_names_by_reference()
i.references_by_resource()
# Same, but considering only evidences curating effect sign:
i.references_by_resource(effect = True)

# These examples demonstrated the design of `get_` and `_by_` methods. These
# methods extract information from data structures held by the Interaction
# object, mostly within `pypath.core.evidence.Evidence(s)` objects. An
# Evidence object always points to a resource (which carries a number of
# meta-information, such as license, URL, etc) and optionally a literature
# reference. Evidences can be accessed in bulk, or filtered and grouped:

i.evidences
# <Evidences: AlzPathway, BEL-Large-Corpus, Baccin2019, BioGRID,
# CancerCellMap, CellChatDB, CellPhoneDB, CellTalkDB, DIP, DLRP, EMBRACE,
# Fantom5, Guide2Pharma, HPMR, HPRD, ICELLNET, IntAct, KEGG, KEGG-MEDICUS,
# Kirouac2010, LRdb, Lit-BM-17, NetPath, PhosphoPoint, ProtMapper, REACH,
# Ramilowski2015, SIGNOR, SPIKE, STRING, SignaLink3, Sparser, Wang,
# cellsignal.com, connectomeDB2020, iTALK, talklr (75 references)>
i.get_evidences(effect = 'positive')
# <Evidences: KEGG, KEGG-MEDICUS, SIGNOR, SPIKE, SignaLink3,
# Wang (9 references)>
i.evidences_by_resource()
# {'AlzPathway': <Evidences: AlzPathway (1 references)>,
# 'Baccin2019': <Evidences: Baccin2019 (7 references)>,
# 'BioGRID': <Evidences: BioGRID (2 references)>,
# ...}

# The evidences are stored by directions:
i.direction
# {
#     'undirected': <Evidences: AlzPathway, BioGRID, CancerCellMap,
#         CellPhoneDB, DIP, HPRD, IntAct, Lit-BM-17, NetPath (24 references)>,
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: Baccin2019, PhosphoPoint,
#         Ramilowski2015, Wang (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: BEL-Large-Corpus,
#         Baccin2019, CellChatDB, CellTalkDB, DLRP, EMBRACE, Fantom5,
#         Guide2Pharma, HPMR, HPRD, ICELLNET, KEGG, KEGG-MEDICUS,
#         Kirouac2010, LRdb, ProtMapper, REACH, Ramilowski2015, SIGNOR,
#         SPIKE, STRING, SignaLink3, Sparser, Wang, cellsignal.com,
#         connectomeDB2020, iTALK, talklr (59 references)>
# }

# Similar dictionaries store the evidences for positive and negative effect
# signs:
i.positive
# {
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: Wang (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: KEGG, KEGG-MEDICUS, SIGNOR,
#         SPIKE, SignaLink3, Wang (9 references)>
# }
i.negative
# {
#     (<Entity: EGFR>, <Entity: EGF>): <Evidences: None (0 references)>,
#     (<Entity: EGF>, <Entity: EGFR>): <Evidences: None (0 references)>
# }

# To extract information more efficiently, we can use the `which_` methods:

i.which_directions()
# ((<Entity: EGFR>, <Entity: EGF>), (<Entity: EGF>, <Entity: EGFR>))
i.which_signs()
# (
#     ((<Entity: EGFR>, <Entity: EGF>), 'positive'),
#     ((<Entity: EGF>, <Entity: EGFR>), 'positive')
# )

# As usual, you can filter by resources:
i.which_signs(resources = 'KEGG-MEDICUS')
# (((<Entity: EGF>, <Entity: EGFR>), 'positive'),)

# We see that most of the evidences support the EGF->EGFR interaction, and
# we might suspect that the EGFR->EGF interaction is wrong or maybe valid
# only in some special context or interpretation. We have a shortcut to get
# a majority consensus across all evidences:

i.majority_dir()
# (<Entity: EGF>, <Entity: EGFR>)
i.majority_sign()
# {
#     (<Entity: EGFR>, <Entity: EGF>): [False, False],
#     (<Entity: EGF>, <Entity: EGFR>): [True, False]
# }

# In the latter output, the `[True, False]` list represents positive and
# negative signs, i.e. only the EGF->EGFR positive sign is supported by
# the majority, all other signs and directions have a False value.

# Some resource specific extra attributes are stored in the `attrs` dict:

i.attrs
# {'cellphonedb_type': 'ligand-ligand',
# 'dip_id': 'DIP-58935E',
# 'dip_methods': ['transmission electron microscopy',
#     'surface plasmon resonance', 'electron tomography', 'molecular sieving',
#     'affinity chromatography technology', 'biochemical', 'x ray scattering',
#     'anti bait coimmunoprecipitation'],
# 'dip_type': ['direct interaction', 'physical association'],
# 'hprd_methods': ['in vitro', 'in vivo'],
# 'intact_methods': {'anti bait coimmunoprecipitation',
#     'fluorescence-activated cell sorting', 'molecular sieving',
#     'proximity ligation assay', 'saturation binding',
#     'surface plasmon resonance', 'transmission electron microscopy',
#     'x ray scattering', 'x-ray crystallography'},
# 'mentha_score': 0.999,
# 'netpath_methods': ['in vitro', 'in vivo'],
# 'netpath_pathways': ['Epidermal growth factor receptor (EGFR)'],
# 'netpath_type': ['physical interaction'],
# 'phosphopoint_category': 'Category 1',
# 'ramilowski_sources': ['DLRP', 'HPMR', 'IUPHAR', 'HPRD', 'STRING.binding',
#     'STRING.experiment', 'literature supported'],
# 'signor_mechanism': ['binding'],
# 'spike_effect': '1',
# 'spike_mechanism': 'Physical Interaction'}

# To get a simple yes/no answer whether an interaction is directed and has
# effect sign:

i.is_directed()
# True
i.has_sign()
# True
i.is_stimulation()
# True
i.is_inhibition()
# False
i.is_mutual()
# True

# The Network and the Interaction objects are able to yield data frame rows:
# these are named tuples which, when bound together, form a data frame.

i.generate_df_records()
# [
#     InteractionDataFrameRecord(
#         id_a='P00533', id_b='P01133', type_a='protein', type_b='protein',
#         directed=True, effect=1, type='post_translational',
#         dmodel={'activity_flow', 'ligand_receptor', 'interaction',
#         'enzyme_substrate'}, sources={'Wang'}, references=None),
#     ...
# ]

# The `get_df` method of the Network object returns the actual data frame:

df = n.get_df()

# Each interaction has two molecule partners called `a` and `b`, ordered
# alphabetically:

i.a
# <Entity: EGFR>
i.b
# <Entity: EGF>

# The two possible directions:
i.a_b
# (<Entity: EGFR>, <Entity: EGF>)
i.b_a
# (<Entity: EGF>, <Entity: EGFR>)

# The Entity objects carry some essential information:

i.a.identifier
# 'P00533'
i.a.id_type
# 'uniprot'
i.a.entity_type
# 'protein'
i.a.label
# 'EGFR'
i.a.taxon
# 9606

# The protein type Entity objects are able to print comprehensive
# information from UniProt as a table:

i.a.info()
#=====> [1 proteins] <=====
#╒═══════╤═══════╤═════════╤═════════╤═════════╤═════════╤═════════╤═════════╤═════════╕
#│   No. │ ac    │ genes   │   lengt │ weigh   │ full_   │ funct   │ keywo   │ subce   │
#│       │       │ ymbol   │       h │ t       │ name    │ ion_o   │ rds     │ llula   │
#│       │       │         │         │         │         │ r_gen   │         │ r_loc   │
#│       │       │         │         │         │         │ ecard   │         │ ation   │
#│       │       │         │         │         │         │ s       │         │         │
#╞═══════╪═══════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
#│     1 │ P0053 │ EGFR    │    1210 │ 13427   │ Epide   │ Recep   │ 3D-st   │ Cell    │
#│       │ 3     │         │         │ 7       │ rmal    │ tor t   │ ructu   │ membr   │
#│       │       │         │         │         │ growt   │ yrosi   │ re, A   │ ane;    │
#│       │       │         │         │         │ h fac   │ ne ki   │ ltern   │ Singl   │
#│       │       │         │         │         │ tor r   │ nase    │ ative   │ e-      │
#│       │       │         │         │         │ ecept   │ bindi   │ splic   │ pass    │
#│       │       │         │         │         │ or      │ ng li   │ ing,    │ type    │
#│       │       │         │         │         │         │ gands   │ ATP-b   │ I mem   │
#│       │       │         │         │         │         │ of      │ indin   │ brane   │
#│       │       │         │         │         │         │ the     │ g,      │ prote   │
#│       │       │         │         │         │         │ EGF f   │ Cell    │ in. E   │
#│       │       │         │         │         │         │ amily   │ membr   │ ndopl   │
#│       │       │         │         │         │         │ and a   │ ane,    │ asmic   │

# Now let's look into the Evidence objects. First we look at an Evidences
# object, which is a collection of many evidences:

evs = i.evidences

# The `match` method of evidences returns bool, while the `filter` method
# returns the Evidence objects passing the filter. Is there any evidence in
# this collection with activity flow data model?

evs.match(data_model = 'activity_flow')
# True

# Get those evidences as a list:

list(evs.filter(data_model = 'activity_flow'))
# [<Evidence SPIKE (1 references)>, <Evidence SignaLink3 (5 references)>,
#  <Evidence Guide2Pharma (0 references)>, ...]

# The Network, the Interaction and the Evidences objects all have `count_`
# methods. For example, how many unique references are available in this
# evidence collection?

evs.count_references()
# 75

# We can also retrieve the list of those references:

evs.get_references()
# {<Reference: 24709886>, <Reference: 27564112>, <Reference: 15958209>, ...}

# The Reference objects carry the PubMed IDs:
# NOTE(review): the output of `get_references` above looks like a set, so
# the element picked by `common.first` is presumably arbitrary -- the PubMed
# ID printed below is only an example; confirm against `common.first`.
r = common.first(evs.get_references())
r.pmid
# '25453753'

# They are able to fetch detailed information from PubMed:
r.info()
# {
#     '25453753': {
#         'elocationid': 'doi: 10.1016/j.celrep.2014.10.010',
#         'epubdate': '2014 Nov 6',
#         'essn': '2211-1247',
#         'fulljournalname': 'Cell reports',
#         'issue': '4',
#         'lang': ['eng'],
#         'lastauthor': 'Lemmon MA',
#         'pages': '1306-17',
#         'pmcrefcount': 29,
#         'pubdate': '2014 Nov 20',
#         'volume': '9'
#         ...
#     },
# }

# Or open the article in a browser by `r.open()`.

# The resources are available as Resource objects, or as simple strings.

evs.get_resources()
# {<NetworkResource: BEL-Large-Corpus (post_translational, enzyme_substrate)>,
#  <NetworkResource: HPMR (post_translational, ligand_receptor)>,
# ...}
evs.get_resource_names()
# {'AlzPathway', 'Baccin2019', 'BioGRID', 'CancerCellMap', 'CellChatDB', ...}

# Certain resources integrate data from other resources, so the data comes
# from the original resource via another resource. To list such indirectly
# obtained resources:

evs.get_resource_names(via = True)
# {'Fantom5', 'STRING', 'Guide2Pharma', 'HPRD', ...}

# Or only the ones obtained via ProtMapper:
evs.get_resource_names(via = 'ProtMapper')
# {'BEL-Large-Corpus', 'Sparser', 'REACH'}

# It is possible to list the resources as tuples, each having the original
# resource as first element and the mediating resource as second:
evs.get_resource_names_via(via = None)
# {('AlzPathway', None), ('BEL-Large-Corpus', 'ProtMapper'),
#  ('BioGRID', None), ('CancerCellMap', None), ('CellChatDB', None),
#  ('DLRP', 'talklr'), ('EMBRACE', None), ('Fantom5', 'LRdb'), ...}

# The individual evidences are stored in the `evidences` attribute. Let's
# take a closer look at one Evidence object:

ev = common.first(evs.evidences.values())
ev
# <Evidence SPIKE (1 references)>

ev.resource
# <NetworkResource: SPIKE (post_translational, activity_flow)>
ev.references
# {<Reference: 20458382>}

# The Network, Interaction and Evidence(s) objects support the `in`
# operator to check for the presence of resources, molecules or references,
# and Evidence objects are equal to the name of their resource:

'EGF' in n
# True
'SIGNOR' in i
# True
'EGFR' in i
# True
'20458382' in i # a PubMed ID
# True
'SPIKE' in evs
# True
'20458382' in evs # a PubMed ID
# True
ev == 'SPIKE'
# True

# Each Evidence object has a Resource object; now we take a closer look at
# this:

res = ev.resource

# Some essential attributes are the following:

res.name
# 'SPIKE'
res.data_type
# 'network'
res.interaction_type
# 'post_translational'
res.data_model
# 'activity_flow'
res.via
# None

# The `via` attribute carries the name of the mediator resource: pypath got
# the data from that resource, but the original resource is supposed to be
# the one in the `name` attribute.

# How the resource has been loaded is defined in a NetworkInput object.
# For example, to find the function or file which provided the raw data:

res.networkinput.input
# 'spike.spike_interactions'

# And to find out in this raw data which column contained the references:

res.networkinput.refs
# (5, ';')

# Or what was the identifier type in the raw data:

res.networkinput.id_type_a
# 'genesymbol'

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.