# Created by Dénes Türei
#!/usr/bin/env python
# Dénes Türei, Uniklinik RWTH Aachen & EMBL Heidelberg 2017
# turei.denes@gmail.com
import imp
import importlib
import os
import re
import time
import urllib
import urllib.error
import urllib.request
class CasPubChemCID(object):
    """
    Look up PubChem compound IDs (CIDs) for CAS numbers listed in a text file.

    Each non-blank input line is parsed as two tab separated names,
    optionally followed by a CAS number in square brackets. Every distinct
    CAS number is queried against the PubChem PUG REST name-to-CID
    endpoint, and the result is written as a tab separated table with the
    columns: name1, name2, CAS, semicolon separated CIDs.
    """

    # URL template; ``%s`` is replaced by the CAS number.
    resturl = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/cids/txt'
    # Captures a bracketed run of digits and hyphens. The leading ``.*`` is
    # greedy, so the last such bracket on the line is the one captured.
    recas = re.compile(r'.*\[([\-0-9]*)\].*')
    # Pause in seconds before each REST request
    # (presumably to stay below PubChem's request-rate limit -- TODO confirm).
    request_delay = 0.18

    def __init__(self,
                 infile = 'cmap_name_and_catalog_name_from.cmap.txt'):
        """
        :param str infile: Path of the input file. The output file gets the
            same name prefixed with ``pubchems__`` and is placed in the
            same directory as the input.
        """

        self.infile = infile
        # Prefix only the file name: prefixing the whole path would point
        # into a non-existent directory whenever `infile` has a directory
        # part. For a bare file name the result is the plain prefixed name.
        dirname, basename = os.path.split(infile)
        self.outfile = os.path.join(dirname, 'pubchems__%s' % basename)

    def reload(self):
        """
        Reload the module defining this class and rebind this instance to
        the freshly loaded class, so code changes apply to a live object.
        """

        modname = self.__class__.__module__
        mod = importlib.import_module(modname)
        # `imp.reload` is deprecated and `imp` is gone in Python 3.12.
        importlib.reload(mod)
        self.__class__ = getattr(mod, self.__class__.__name__)

    def main(self):
        """
        Run the whole workflow: read the input, query PubChem, write output.
        """

        self.read_infile()
        self.rest_requests()
        self.write_result()

    def read_infile(self):
        """
        Parse the input file into ``names1``, ``names2`` (tuples) and
        ``cas_list`` (list); the three are aligned by line.

        Blank lines are skipped. A line without a bracketed CAS number gets
        an empty string in ``cas_list``; a line with a single name field
        gets an empty string in ``names2``.
        """

        names1, names2, cas_list = [], [], []

        with open(self.infile, 'r') as fp:
            for ln in fp:
                if not ln.strip():
                    continue
                # The names are whatever precedes the first bracket.
                fields = ln.split('[')[0].strip().split('\t')
                names1.append(fields[0])
                names2.append(fields[1] if len(fields) > 1 else '')
                m = self.recas.match(ln)
                cas_list.append(m.group(1) if m else '')

        self.names1 = tuple(names1)
        self.names2 = tuple(names2)
        self.cas_list = cas_list

    def _fetch_cids(self, cas):
        """
        Return the list of CIDs (as strings) PubChem reports for one CAS
        number; an HTTP error response yields an empty list.
        """

        try:
            # The context manager closes the connection after reading.
            with urllib.request.urlopen(self.resturl % cas) as resp:
                body = resp.read()
        except urllib.error.HTTPError:
            return []

        # One CID per line; `split()` also yields [] for an empty body.
        return body.decode('ascii').split()

    def rest_requests(self):
        """
        Fill ``cas2cid``, a dict mapping each CAS number in ``cas_list`` to
        its list of CIDs.

        Each distinct CAS number is requested only once. Empty CAS strings
        map to an empty list without any request being sent.
        """

        self.cas2cid = {}

        for cas in self.cas_list:
            if cas in self.cas2cid:
                continue
            if not cas:
                self.cas2cid[cas] = []
                continue
            time.sleep(self.request_delay)
            self.cas2cid[cas] = self._fetch_cids(cas)

    def write_result(self):
        """
        Write one tab separated row per input record to ``outfile``:
        name1, name2, CAS and the CIDs joined by semicolons. Rows are
        separated by newlines, with no newline after the last row.
        """

        rows = (
            '\t'.join((
                name1,
                name2,
                cas,
                ';'.join(self.cas2cid.get(cas, [])),
            ))
            for name1, name2, cas in zip(
                self.names1,
                self.names2,
                self.cas_list,
            )
        )

        with open(self.outfile, 'w') as fp:
            fp.write('\n'.join(rows))
|