Commits

Andrew Dalke  committed c5392c3

Tool to extract and clean up SMILES from the BindingDB data sets.

  • Participants
  • Parent commits 8f6826b

Comments (0)

Files changed (1)

File BindingDB_exact/bindingdb_cleanup.py

+# Clean up the SMILES in the BindingDB data sets
+
+from __future__ import print_function
+
+import sys
+import re
+import argparse
+import subprocess
+
+
+parser = argparse.ArgumentParser(description="Extract, clean up, and validate BindingDB SMILES")
+
+parser.add_argument("filename", metavar="FILENAME", nargs=1,
+                    help="BindingDB filename containing SMILES")
+parser.add_argument('--opensmiles', action="store_true", default=None,
+                    help="Validate SMILES using smiles_counts from the opensmiles-ragel project")
+parser.add_argument('--smiles_counts-path', default=None,
+                    help="Path to the smiles_counts executable")
+parser.add_argument('--oechem', action="store_true", default=None,
+                    help="Validate SMILES using OpenEye's OEChem toolkit")
+parser.add_argument('--rdkit', action="store_true", default=None,
+                    help="Validate SMILES using the RDKit toolkit")
+parser.add_argument('--openbabel', action="store_true", default=None,
+                    help="Validate SMILES using the OpenBabel toolkit")
+parser.add_argument("--output", default=None,
+                    help="output filename (default is stdout)")
+parser.add_argument("--failed", action="store_true",
+                    help="Report which tests failed")
+parser.add_argument("--quiet", action="store_true",
+                    help="Disable toolkit warnings")
+
+
+
+def open_filename(filename):
+    if filename.endswith(".bz2"):
+        # Open the bz2 compressed file.
+        try:
+            # First try the Python bzip extension module
+            import bz2
+        except ImportError:
+            # If that doesn't work, try the command-line bzcat tool
+            import subprocess
+            p = subprocess.Popen(["bzcat", filename],
+                                 stdout = subprocess.PIPE)
+            return p.stdout
+        else:
+            return bz2.BZ2File(filename)
+    else:
+        return open(filename)
+
+
+###########
+# Helper function to convert things like '[C ]' to '[C+]'
+# These likely came from a one too many/too few URL encoding translations
+
+element_symbols_pattern = \
+  r"C[laroudsemf]?|Os?|N[eaibdpos]?|S[icernbmg]?|P[drmtboau]?|"  \
+  r"H[eofgas]?|c|n|o|s|p|A[lrsgutcm]|B[eraik]?|Dy|E[urs]|F[erm]?|"  \
+  r"G[aed]|I[nr]?|Kr?|L[iaur]|M[gnodt]|R[buhenaf]|T[icebmalh]|" \
+  r"U|V|W|Xe|Yb?|Z[nr]"
+
+_plus_insert = re.compile(r"(?P<a>(%s)(H[0-9]*)?@*) (?P<b>[0-9]*\])" % (element_symbols_pattern,))
+_double_plus_insert = re.compile(r"(?P<a>(%s)(H[0-9]*)?@*)  (?P<b>])" % (element_symbols_pattern,))
+_incomplete_bracket = re.compile(r"(([ONn]|H\d?)@?\+|[O]-) \]")
+def restore_pluses(line):
+    line = _plus_insert.sub(r"\g<a>+\g<b>", line)
+    line = _double_plus_insert.sub(r"\g<a>++\g<b>", line)
+    if " ]" in line:
+        # Strangely, similar.dat.bz2 has entries like:
+        #  NC(=[NH2+])c1ccc2nc(Cc3[nH]c4cc(ccc4[nH+ ]3)C(N)=[NH2+])[nH]c2c1
+        #  c1nc(c2c(n1)n(cn2)C3C(C(C(O3)COP(=O)([O- ])OCC(CCC(=O)[O-])[NH3+])O)O)N
+        # I can't figure out why the ' ]' is there, nor what they should be.
+        if _incomplete_bracket.search(line):
+            pass
+        elif "dichloro" in line:
+            pass
+        else:
+            raise AssertionError(line)
+    if " 2]" in line or " 3]" in line:
+            raise AssertionError(line)
+    return line
+
+############## Some quick checks without any toolkit
+
+# Some of the patterns use an 'H' which is not in []s
+# Examples:
+#   OC(Nc1nnHc2cc(ccc12)-c3cccs3)C4CC4
+#   Nc1cccc(CN2C@H(Cc3ccccc3)C@H(O)C@@H(CCc4ccccc4)NC2O)c1
+# This is a SMARTS variant, which I don't 
+
+
+############## Use the ragel-opensmiles code as a fast pre-filter
+
+def is_valid_opensmiles(smiles):
+    return True
+
+_opensmiles = None
+def enable_opensmiles(path=None):
+    if path is None:
+        path = "smiles_counts"
+        
+    global _opensmiles, is_valid_opensmiles
+    try:
+        _opensmiles = subprocess.Popen([path],
+                                       stdin = subprocess.PIPE,
+                                       stdout = subprocess.PIPE)
+    except (IOError, OSError) as err:
+        raise SystemExit(
+            "Unable to start the 'smiles_counts' excutable using --smiles_counts-path = %r\n%s\n" %
+            (path, err))
+    is_valid_opensmiles = real_is_valid_opensmiles
+    assert is_valid_opensmiles("C")
+    assert not is_valid_opensmiles("Q")
+
+def real_is_valid_opensmiles(smiles):
+    _opensmiles.stdin.write(smiles + "\n")
+    _opensmiles.stdin.flush()
+    result = _opensmiles.stdout.readline()
+    return "-1" not in result
+
+############## Check that OEChem accepts the SMILES
+
+def is_valid_oechem(smiles):
+    return True
+
+_oemol = None
+OEParseSmiles = None
+def enable_oechem(disable_messages):
+    global _oemol, OEParseSmiles, is_valid_oechem
+    from openeye.oechem import OEGraphMol, OEParseSmiles
+    _oemol = OEGraphMol()
+    is_valid_oechem = strict_is_valid_oechem
+    if disable_messages:
+        from openeye.oechem import OEThrow, oenul
+        OEThrow.SetOutputStream(oenul)
+    
+
+_oechem_should_not_accept_these = set((
+    r"C\\C(O)=C(/C#N)C(=O)Nc1ccc(cc1)C(F)(F)F",              # in exact.dat.bz2
+    r"CC(C)CCC[C@@H](C)CCC[C@@H](C)CCCC(/C)=C/",             # in similar.dat.bz2
+    r"Brc1cccc(c1)Nc3ncnc2c3cc(cc2)NC(=O)/C=C/",             # in similar.dat.bz2
+    r"O(c1ccc(cc1)/C(c2ccc(O)cc2)=C(\\c3ccccc3)CC)CCN(C)C",  # in similar.dat.bz2
+    ))
+
+def strict_is_valid_oechem(smiles):
+    _oemol.Clear()
+    # Process in strict mode. Otherwise some very strange SMILES get through.
+    # (Like 'ttttt', which OEChem accepts as a set of tritium atoms.)
+    is_valid = OEParseSmiles(_oemol, smiles, False, True)
+    if is_valid:
+        # Possibly override the decision
+        if ">" in smiles:
+            # Ignore reactions
+            return False
+
+        if smiles in _oechem_should_not_accept_these:
+            return False
+
+    return is_valid
+
+# Various and incomplete ways that 'H' can be used as an atom
+_H_as_atom = re.compile(r"[1-9]nH|H[()]|cnH|nHc|nHn|[()]NH|[()]nH|nnH|CNH|^H")
+
+def lenient_is_valid_oechem(smiles):
+    _oemol.Clear()
+    # Process in strict mode. Otherwise some very strange SMILES get through.
+    # (Like 'ttttt', which OEChem accepts as a set of tritium atoms.)
+    is_valid = OEParseSmiles(_oemol, smiles, False, True)
+    if is_valid:
+        # Possibly override the decision
+        if ">" in smiles:
+            # Ignore reactions
+            return False
+
+        if _H_as_atom.search(smiles):
+            # In non-strict mode, 'H' is accepted as an element.
+            # Pretend that OpenEye didn't accept it
+            return False
+
+        if smiles in _oechem_should_not_accept_these:
+            # I think there's an OEChem parser failure. It shouldn't accept this.
+            return False
+
+        if smiles in ("tttttttttttttttttttttttttttttttt",  # sequence of tritium hydrogens
+                      "BR",  # Syracuse SMILES
+                      "sd",  # sulpher and deuterium
+                      ):
+            return False
+
+    return is_valid
+    
+
+############## Check that RDKit accepts the SMILES
+
+def is_valid_rdkit(smiles):
+    return True
+
+Chem = None
+def enable_rdkit(disable_messages):
+    global is_valid_rdkit, Chem
+    from rdkit import Chem
+    is_valid_rdkit = real_is_valid_rdkit
+    if disable_messages:
+        from rdkit import RDLogger
+        RDLogger.DisableLog("rdApp.info")
+        RDLogger.DisableLog("rdApp.warning")
+        RDLogger.DisableLog("rdApp.error")
+
+def real_is_valid_rdkit(smiles):
+    return bool(Chem.MolFromSmiles(smiles))
+
+
+############## Check that OpenBabel accepts the SMILES
+
+def is_valid_openbabel(smiles):
+    return True
+
+_obconv = None
+_obmol = None
+def enable_openbabel(disable_messages):
+    global _obconv, _obmol, is_valid_openbabel
+    import openbabel as ob
+    _obmol = ob.OBMol()
+    _obconv = ob.OBConversion()
+    assert _obconv.SetInFormat("smi"), "Cannot set format to SMILES"
+    is_valid_openbabel = real_is_valid_openbabel
+
+    if disable_messages:
+        from openbabel import obErrorLog
+        obErrorLog.StopLogging()
+
+_openbabel_should_not_accept_these = set((
+    # in exact.dat.bz2
+    "C(",
+    "c1cc(C)ccc1N=(O)=O",
+    "(N(=0)(=0)c1ccccc1COC(=O)",
+    "=C/C(=O)CC(=O)/C=",
+    "CC(C)C(=O)Nc1ncc(s1)S(=O)(=",
+    "C\\\\C(O)=C(/C#N)C(=O)Nc1ccc(cc1)C(F)(F)F",
+    "O=",
+    "Br-.Cn1c(-c2ccccc2)c3cc(N)ccc3c4ccc(N)cc14",
+    "CCN1c2cc(ccc2N(C)C(O)c3cccnc13)NNN-",
+    "CCN1c2ncc(CCc3cccc(c3)NNN-)cc2C(O)N(C)c4ccc(Cl)nc14",
+
+    # substructure.dat
+    "(cccccc)",
+
+    # similar.dat.bz2
+    "CC(=O)N[C@@H]1[C@@H](O)C=C(O[C@H]1[C@H](",
+    r"O(c1ccc(cc1)/C(c2ccc(O)cc2)=C(\\c3ccccc3)CC)CCN(C)C",
+    r"C-#O",
+    ))
+    
+def real_is_valid_openbabel(smiles):
+    _obmol.Clear()
+    is_valid = _obconv.ReadString(_obmol, smiles)
+    if is_valid:
+        # CSC1Nc2ccccc2-c3c(C1)c4cc(ccc4n3C)N(O-)O in exact.dat.bz2 is one of many
+        if "(O-)" in smiles:
+            return False
+
+        # Don't know how to handle the space in similar.dat.bz2:
+        #  Fc1ccccc1COC(C(=O)NC3c2ccccc2CC3O)C(O)C( O)C(OCc4ccccc4F)C(=O)NC6c5ccccc5CC6O
+        # This gets converted to:
+        #  Fc1ccccc1COC(C(=O)NC3c2ccccc2CC3O)C(O)C(
+        # which isn't a valid SMILES, but OpenBabel accepts it.
+        # There are many SMILES in similar.dat.bz2 which are broken by this strange space.
+        if smiles[-1:] in "(=/#-":
+            return False
+
+        if smiles in _openbabel_should_not_accept_these:
+            return False
+        
+    return is_valid
+
+
+##############
+
+def is_valid_smiles(smiles):
+    if (not is_valid_opensmiles(smiles) and
+        # The OpenSMILES Ragel parser doesn't handle 'te'. RDKit and OEChem do.
+        smiles != "COc1ccccc1N2CCN(CCCCNC(=O)c3cc4ccccc4[te]3)CC2" and
+        smiles != "Clc1cccc(N2CCN(CCCCNC(=O)c3cc4ccccc4[te]3)CC2)c1Cl"
+        ):
+
+        ## # Verified that OpenSMILES-Ragel rejects molecules that RDKit doesn't support.
+        if real_is_valid_rdkit(smiles):
+            raise AssertionError("RDKit says %r is real" % (smiles,))
+        
+        ## # Verified that OpenSMILES-Ragel rejects molecules that OEChem doesn't support.
+        ## # (I hand-checked the failed SMILES which OEChem does support, and
+        ## # verified that they should be ignored.)
+        ##
+        if lenient_is_valid_oechem(smiles):
+            raise AssertionError("OEChem/lenient says %r is real" % (smiles,))
+
+        if strict_is_valid_oechem(smiles):
+            raise AssertionError("OEChem/strict says %r is real" % (smiles,))
+
+        ## # Verified that OpenSMILES-Ragel rejects molecules that OpenBabel doesn't support.
+        ## # (I hand-checked the failed SMILES which OpenBabel does support, and
+        ## # verified that they should be ignored.)
+        ## 
+        ## if real_is_valid_openbabel(smiles):
+        ##     raise AssertionError("OpenBabel says %r is real\n" % (smiles,))
+        
+        return False, "opensmiles"
+    
+    if not is_valid_oechem(smiles):
+        return False, "oechem"
+
+    if not is_valid_rdkit(smiles):
+        return False, "rdkit"
+    
+    if not is_valid_openbabel(smiles):
+        return False, "openbabel"
+    
+    return True, None
+
+def main():
+    args = parser.parse_args()
+    if (args.opensmiles is None and
+        args.oechem is None and
+        args.rdkit is None and
+        args.openbabel is None):
+        parser.error("Must specify at least one of --oechem, --rdkit, --openbabel or --opensmiles")
+
+    filename = args.filename[0]
+
+    options = []
+    for argname in "opensmiles oechem rdkit openbabel".split():
+        if getattr(args, argname):
+            options.append(argname)
+    if options:
+        sys.stderr.write("Using: %s\n" % (" ".join(options)))
+    else:
+        sys.stderr.write("Basic SMILES validation only")
+
+    if args.opensmiles:
+        enable_opensmiles(args.smiles_counts_path)
+
+    if args.oechem:
+        enable_oechem(disable_messages = args.quiet)
+        
+    if args.rdkit:
+        enable_rdkit(disable_messages = args.quiet)
+
+    if args.openbabel:
+        enable_openbabel(disable_messages = args.quiet)
+
+    if args.output is None:
+        outfile = sys.stdout
+    else:
+        outfile = open(args.output, "w")
+    ################### Start processing!
+
+    known_bad = set([
+        "",  # empty lines
+        "null",
+        "undefined",
+        "n/a",
+        ])
+
+    seen = dict()
+        
+    for line in open_filename(filename):
+        line = line.strip()
+        if line in known_bad:
+            continue
+
+        if ("SELECT cd_id" in line or
+            "select cd_id" in line):
+            continue
+
+        line = restore_pluses(line)
+
+        smiles = line.split()[0]
+
+        if smiles in seen:
+            is_good = seen[smiles]
+        else:
+            is_good, reason = is_valid_smiles(smiles)
+            if not is_good and args.failed:
+                sys.stderr.write("Failed at %s: %s\n" % (reason, smiles))
+                
+            seen[smiles] = is_good
+
+        if is_good:
+            outfile.write(smiles + "\n")
+
+if __name__ == "__main__":
+    main()