Snippets

Dénes Türei Translate CAS IDs to PubChem CID

Created by Dénes Türei
#!/usr/bin/env python

# Dénes Türei, Uniklinik RWTH Aachen & EMBL Heidelberg 2017
# turei.denes@gmail.com

import imp  # NOTE: deprecated since Python 3.4, removed in Python 3.12
import importlib
import os
import re
import time
import urllib
import urllib.error
import urllib.request

class CasPubChemCID(object):
    """
    Translate CAS registry numbers to PubChem compound IDs (CIDs).

    The input is a text file where each line holds two tab separated
    names, optionally followed by a CAS number in square brackets, e.g.
    ``name1<TAB>name2 [50-78-2]``. Each distinct CAS number is looked up
    through the PubChem PUG REST service and the result is written to a
    tab separated file with the columns *name1*, *name2*, *CAS* and the
    semicolon separated list of CIDs.

    Typical use::

        CasPubChemCID('my_compounds.txt').main()
    """

    # PUG REST endpoint; ``%s`` is replaced by the CAS number, which is
    # submitted as a compound name.
    resturl = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/cids/txt'
    # Captures the digits-and-hyphens content of a bracketed field; the
    # leading greedy ``.*`` makes it pick the last such field in a line.
    recas = re.compile(r'.*\[([\-0-9]*)\].*')
    # Seconds to wait before each request; presumably chosen to stay
    # below PubChem's request rate limit -- confirm before lowering.
    delay = 0.18

    def __init__(self,
                 infile = 'cmap_name_and_catalog_name_from.cmap.txt'):
        """
        :param str infile:
            Path of the input file. The output is written next to it,
            with ``pubchems__`` prepended to the file name.
        """

        self.infile = infile
        # Prefix only the file name, so an ``infile`` containing
        # directories still yields a valid output path.
        dirname, basename = os.path.split(infile)
        self.outfile = os.path.join(dirname, 'pubchems__%s' % basename)

    def reload(self):
        """
        Reload the module defining this class and rebind this instance
        to the freshly loaded class (handy in interactive sessions).
        """

        mod = importlib.import_module(self.__class__.__module__)
        mod = importlib.reload(mod)
        self.__class__ = getattr(mod, self.__class__.__name__)

    def main(self):
        """
        Run the whole workflow: read the input, query PubChem and
        write the result file.
        """

        self.read_infile()
        self.rest_requests()
        self.write_result()

    def read_infile(self):
        """
        Parse the input file into ``names1``, ``names2`` (tuples) and
        ``cas_list`` (list), all of the same length and order.

        Blank lines are skipped. A line without a second column gets an
        empty second name, columns beyond the second are ignored, and a
        line without a bracketed CAS number gets an empty string as CAS.
        """

        names1, names2, cas_list = [], [], []

        with open(self.infile, 'r') as fp:

            for ln in fp:

                if not ln.strip():
                    continue

                # Everything before the first bracket is the names part.
                fields = ln.split('[')[0].strip().split('\t')
                names1.append(fields[0])
                names2.append(fields[1] if len(fields) > 1 else '')

                m = self.recas.match(ln)
                cas_list.append(m.group(1) if m else '')

        self.names1 = tuple(names1)
        self.names2 = tuple(names2)
        self.cas_list = cas_list

    def rest_requests(self):
        """
        Fill ``cas2cid``: a dict mapping each CAS number in ``cas_list``
        to the list of CIDs (strings) returned by PubChem.

        Empty CAS strings map to an empty list without any request, and
        each distinct CAS number is queried only once.
        """

        self.cas2cid = {}

        for cas in self.cas_list:

            if cas in self.cas2cid:
                continue

            if not cas:
                # Nothing to look up for lines without a CAS number.
                self.cas2cid[cas] = []
                continue

            time.sleep(self.delay)
            self.cas2cid[cas] = self._fetch_cids(cas)

    def _fetch_cids(self, cas):
        """
        Query PubChem for one CAS number and return its CIDs as a list
        of strings; an HTTP error response yields an empty list.
        """

        try:
            with urllib.request.urlopen(self.resturl % cas) as resp:
                raw = resp.read()

        except urllib.error.HTTPError:
            return []

        # The ``txt`` output is parsed as whitespace separated CIDs, so
        # an empty body results in an empty list.
        return raw.decode('ascii').split()

    def write_result(self):
        """
        Write one tab separated line per input record to ``outfile``:
        name1, name2, CAS and the CIDs joined by semicolons. Lines are
        separated by newlines, with no newline after the last one.
        """

        rows = (
            '\t'.join((name1, name2, cas, ';'.join(self.cas2cid[cas])))
            for name1, name2, cas in zip(
                self.names1,
                self.names2,
                self.cas_list
            )
        )

        with open(self.outfile, 'w') as fp:

            fp.write('\n'.join(rows))

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.