Source code for schrodinger.livedesign.preprocessor

import copy
import datetime
import enum
import math
import re
from typing import Any
from typing import Callable
from typing import NamedTuple
from typing import Optional
from typing import Tuple

import decorator
import numpy as np
from rdkit import Chem
from rdkit.Chem import SaltRemover
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolAlign
from rdkit.Chem import rdmolops
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.MolStandardize import rdMolStandardize

import schrodinger
from schrodinger import adapter
from schrodinger import rdkit_extensions
from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.livedesign import convert
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log

try:
    from schrodinger.application.epik import TautomerizerException
    from schrodinger.application.epik import epik_tautomer_canonicalizer
except ImportError as e:
    epik_tautomer_canonicalizer = e

logger = log.get_output_logger(__file__)

audit_changes_log = None


[docs]def initialize_audit_log(verbose: bool): """Initialize global audit for logging purposes""" global audit_changes_log audit_changes_log = [] if verbose else None
# statically store the MolVS class used in NEUTRALIZE here so it is only # created and initialized once: uncharger = rdMolStandardize.Uncharger(canonicalOrder=True) # string constants for mol props ATOM_PROP_MOL_PARITY = 'molParity' ATOM_PROP_ORIGINAL_INDEX = '_AtomOriginalIndex' ATOM_PROP_DUMMY_LABEL = 'dummyLabel' ATOM_PROP_ATOM_LABEL = 'atomLabel' ATOM_PROP_MAP_NUMBER = 'molAtomMapNumber' BOND_PROP_BOND_STEREO = "_MolFileBondStereo" BOND_PROP_BOND_CONFIG = "_MolFileBondCfg" BOND_PROP_BOND_TYPE = '_MolFileBondType' BOND_PROP_END_PTS = "_MolFileBondEndPts" MOL_PROP_LINK_NODES = "_molLinkNodes" MOL_PROP_ATTACHPT = "molAttchpt" MOL_PROP_CHIRAL_FLAG = "_MolFileChiralFlag" MOL_PROP_R_LABEL = "_MolFileRLabel" MOL_PROP_UNKNOWN_STEREO = "_UnknownStereo" LD_PROP_FORCE_COORDGEN = "b_ld_alwayscoordgen" MOL_PROP_ON = 1 MOL_PROP_OFF = 0 _SANITIZE_OPS = Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_CLEANUPCHIRALITY ^ Chem.SANITIZE_FINDRADICALS STD_DEFAULT_SALTS = ( "[Cl,Br,I]", "[Li,Na,K,Ca,Mg]", "[O,N]", "[N](=O)(O)O", "[P](=O)(O)(O)O", "[P](F)(F)(F)(F)(F)F", "[S](=O)(=O)(O)O", "[CH3][S](=O)(=O)(O)", # p-Toluene sulfonate "c1cc([CH3])ccc1[S](=O)(=O)(O)", # Acetic acid "[CH3]C(=O)O", # TFA "FC(F)(F)C(=O)O", # Fumarate/Maleate "OC(=O)C=CC(=O)O", # Oxalate "OC(=O)C(=O)O", # Tartrate "OC(=O)C(O)C(O)C(=O)O", # Dicylcohexylammonium "C1CCCCC1[NH]C1CCCCC1", ) STD_TRANSFORMATIONS = ( # Nitro and N-oxide (pentavalent N) "[#7+0;v5:1]=[#8+0:2]>>[#7+:1]-[#8-:2]", # Phosphonium Ylide "[#6:3][P+:1]([#6:4])([#6:5])[#6-:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]", # Incorrect (but unambiguous) Phosphonium Ylide observed in ChEMBL "[#6:3][P-:1]([#6:4])([#6:5])[#6+:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]", # Sulfoxide "[#6:3][S+:1]([#6:4])-[#8-:2]>>[#6:3][S;X3+0:1]([#6:4])=[#8-0:2]", # Phosphonate "[#6:3][P+:1]([#8;X2:4])([#8;X2:5])[#8-:2]>>[#6:3][P+0:1]([#8:4])([#8:5])=[#8-0:2]", # Sulfone "[#6:3][S+:1]([#6:4])([#8-:2])=[O:5]>>[#6:3][S+0:1]([#6:4])(=[#8-0:2])=[O:5]", # Azide "[#7;A;X2-:1][N;X2+:2]#[N;X1:3]>>[#7-0:1]=[N+:2]=[#7-:3]", # Diazo "[#6;X3-:1][N;X2+:2]#[N;X1:3]>>[#6-0;A:1]=[N+:2]=[#7-:3]", ) IMINE_QUERY = Chem.MolFromSmarts(r'[*]=[N;X2;h0][H]')
[docs]class ExplicitHydrogens(enum.Enum): REMOVE_ALL = enum.auto() KEEP_WEDGED = enum.auto() ADD_ALL = enum.auto() AS_IS = enum.auto() ON_HETERO_AND_KEEP_WEDGED = enum.auto()
[docs]class GenerateCoordinates(enum.Enum): NONE = enum.auto() FULL = enum.auto() FULL_ALIGNED = enum.auto()
[docs]class ChiralFlag0Meaning(enum.Enum): UNGROUPED_ARE_ABSOLUTE = enum.auto() # ABS (most customers choose this) UNGROUPED_ARE_RACEMIC = enum.auto() # AND (BIOVIA standard) UNGROUPED_ARE_RELATIVE = enum.auto() # OR
[docs]class PreprocessorOptions(NamedTuple): """ Options dictating preprocessor actions :cvar KEEP_ONLY_LARGEST_STRUCTURE: whether additional discontiguous structures are removed :cvar REMOVE_PROPERTIES: whether properties are removed :cvar STRIP_SALTS: whether salts are removed :cvar RESOLVE_AMBIGUOUS_TAUTOMERS: whether to guess tautomeric state for ambiguous input that would otherwise fail with a kekulization error """ # Filter options MAX_NUM_ATOMS: Optional[int] = None # Chemical identity options KEEP_ONLY_LARGEST_STRUCTURE: bool = True STRIP_SALTS: Optional[Tuple[str]] = None # after removing small molecules NEUTRALIZE: bool = True TRANSFORMATIONS: Optional[Tuple[str]] = STD_TRANSFORMATIONS CHOOSE_CANONICAL_TAUTOMER: bool = False RESOLVE_AMBIGUOUS_TAUTOMERS: bool = False # Stereochemistry options CHIRAL_FLAG_0_MEANING: ChiralFlag0Meaning = ChiralFlag0Meaning.UNGROUPED_ARE_ABSOLUTE STRIP_STEREO_ABSOLUTE_GROUP: bool = True STRIP_AND_GROUPS_ON_SINGLE_ATOM: bool = False # Property options # These properties don't affect deduplication, but the properties and s-group data # from the first instance of a compound registered will be stored in LD REMOVE_PROPERTIES: bool = False # Depiction-related options # These will not affect deduplication GENERATE_COORDINATES: GenerateCoordinates = GenerateCoordinates.FULL_ALIGNED EXPLICIT_HYDROGENS: ExplicitHydrogens = ExplicitHydrogens.REMOVE_ALL CLEAR_INVALID_WEDGE_BONDS: bool = True HEAVY_HYDROGEN_DT: bool = False @staticmethod def _updateConfig(config: dict): """ :param dict: configuration to validate/update deprecated options """ release_version = schrodinger.get_release_name() remove_msg = ("is not a supported configuration option in the " f"{release_version} version of the Schrodinger " "Preprocessor and must be removed") # Validate/fix deprecated options key = "CORE_SUITE_NEUTRALIZE" if config.get(key): raise KeyError(f"{key} {remove_msg}") config.pop(key, None) key = "GENERATE_V3K_SDF" if config.get(key) is False: # aka v2000 raise KeyError(f"{key} {remove_msg}") config.pop(key, None) key = "DOUBLE_BOND_STEREO_STANDARD" if config.get(key, "CROSSED") != "CROSSED": raise KeyError(f"{key} {remove_msg}") config.pop(key, None) key = "REMOVE_SGROUP_DATA" if config.get(key, "NONE") != "NONE": raise KeyError(f"{key} {remove_msg}") config.pop(key, None) key = "FORCE_ABSOLUTE_STEREOCHEMISTRY" if key in config: raise KeyError(f"{key} {remove_msg}; see CHIRAL_FLAG_0_MEANING") key = "RING_REPRESENTATION" if config.get(key, "KEKULE") != "KEKULE": raise KeyError(f"{key} {remove_msg}") config.pop(key, None) # Completely remove, old value no longer has any effect config.pop("STRIP_STEREO_ABSOLUTE_GROUP", None) # Update deprecated option formats key = "STRIP_SALTS" value = config.get(key) if isinstance(value, dict): config[key] = value["SALTS"].split() if value["STRIP"] else [] key = "TRANSFORMATIONS" value = config.get(key) if value and isinstance(next(iter(value), None), dict): config[key] = [transform["STRUCTURE"] for transform in value] key = "CHIRAL_FLAG_TREATMENT" if key in config: raise KeyError(f"{key} {remove_msg}; see CHIRAL_FLAG_0_MEANING") clean_wedge_orientation = config.get('CLEAN_WEDGE_ORIENTATION') coordgen_enabled = config.get('GENERATE_COORDINATES', 'FULL_ALIGNED') != 'NONE' if clean_wedge_orientation is not None and clean_wedge_orientation != coordgen_enabled: raise KeyError(f'CLEAN_WEDGE_ORIENTATION {remove_msg}; wedge ' 'positions depend on coordinate generation') config.pop('CLEAN_WEDGE_ORIENTATION', None) # If coordgen is enabled, we ignore input wedge bonds if not config.get('CLEAR_INVALID_WEDGE_BONDS', True) and coordgen_enabled: raise ValueError("CLEAR_INVALID_WEDGE_BONDS is always True if " "if coordinate generation is on.") explicit_H = config.get("EXPLICIT_HYDROGENS") if not coordgen_enabled and explicit_H == 'ADD_ALL': raise ValueError("EXPLICIT_HYDROGENS=ADD_ALL is not allowed when " "coordinate generation is off.") if not coordgen_enabled and explicit_H == 'REMOVE_ALL': raise ValueError("EXPLICIT_HYDROGENS=REMOVE_ALL is not allowed " "when coordinate generation is off, because it " "may lead to a loss of stereochemistry.")
[docs] @staticmethod def fromConfig(config: dict): """ :param config: configuration from which to build options :raise KeyError: if an unknown key is present :raise ValueError: if an unknown value is present """ PreprocessorOptions._updateConfig(config) key_to_enum = { "EXPLICIT_HYDROGENS": ExplicitHydrogens, "GENERATE_COORDINATES": GenerateCoordinates, "CHIRAL_FLAG_0_MEANING": ChiralFlag0Meaning, } config = copy.deepcopy(config) for key, value in config.items(): if key in key_to_enum: try: config[key] = key_to_enum[key][value] except KeyError: raise ValueError(f"{key} does not support {value}") # Assure all boolean values are set with boolean types for key, default in PreprocessorOptions().toConfig().items(): value = config.get(key, default) if isinstance(default, bool) and not isinstance(value, bool): raise ValueError(f"{key} must have be a boolean value: {value}") try: return PreprocessorOptions(**config) except TypeError as e: raise KeyError(e)
[docs] def toConfig(self) -> dict: config = self._asdict() for key, value in config.items(): if isinstance(value, enum.Enum): config[key] = value.name return config
[docs]def remove_invalid_config_options(config: dict) -> Tuple[str]: """ :param config: configuration from which to build options, from which all invalid keys and values will be stripped :return: tuple of errors encountered """ errors = [] attempts = len(config) for i in range(attempts): try: PreprocessorOptions.fromConfig(config) except (KeyError, ValueError) as exc: errors.append(str(exc)) bad_key = str(exc).strip('"\'').split()[0] del config[bad_key] return tuple(errors)
[docs]@decorator.decorator def audit_changes(func: Callable, mol: Chem.rdchem.Mol, *args): """ When the global audit_changes_log is initialized, compares mol CXSMILES before and after the given function call, capturing information when the CXSMILES has been changed. :param func: transformation function :param mol: molecule to apply transformation to """ if audit_changes_log is None: return func(mol, *args) ext_smiles_format = rdkit_extensions.Format.EXTENDED_SMILES pre_cxsmiles = rdkit_extensions.rdmol_to_text(mol, ext_smiles_format) mol = func(mol, *args) post_cxsmiles = rdkit_extensions.rdmol_to_text(mol, ext_smiles_format) if pre_cxsmiles != post_cxsmiles: changes = {} changes["timestamp"] = f"{datetime.datetime.now().ctime()}" changes["operation"] = f"{func.__name__}{args}" changes["structure"] = post_cxsmiles audit_changes_log.append(changes) return mol
[docs]def getprop(getter: Callable, value: str, default: Any = None) -> Any: try: return getter(value) except KeyError: return default
[docs]def is_wildcard(atom): """Is atom a wildcard?""" return atom.DescribeQuery().strip() == "AtomNull"
[docs]def is_queryatom_exception(atom): """ Normally we raise an exception if query atoms are in the molecule to be preprocessed, but we don't want to do that if the atom is an attachment point :param atom: the atom to check :returns: whether or not the atom is allowed in the preprocessor """ # must be a '*' atom to be an attachment point or in a polymer if not is_wildcard(atom): return False if getprop(atom.GetProp, ATOM_PROP_ATOM_LABEL, '').startswith('_AP'): return True return False
[docs]def coords_all_zero(conf): """ Returns whether or not all atom positions in a conformer are zero """ ps = conf.GetPositions() return np.alltrue(np.abs(ps) < 1e-4)
[docs]@audit_changes def setup_mol(mol): """ Setup on a molecule that is always done regardless of configuration. :param mol: An unsanitized RDKit Mol :returns: A partially sanitized RDKit mol, ready for the standardizer. """ # Don't sanitize kekulization until after fixing hydrogens on aromatic nitrogen ops = _SANITIZE_OPS ^ Chem.SANITIZE_KEKULIZE Chem.SanitizeMol(mol, sanitizeOps=ops) mol.UpdatePropertyCache(strict=False) # For mols with all zero-coordinates we will prime chiral tags # from the atom parities and not recalculate them again. mol = assign_zero_coords_chirality(mol) assign_stereochemistry(mol) return mol
[docs]@audit_changes def check_kekulization(mol, options): if options.CHOOSE_CANONICAL_TAUTOMER or options.RESOLVE_AMBIGUOUS_TAUTOMERS: mol = convert.add_hs_to_aromatic_nitrogen(mol) # do not attempt to sanitize with kekulization before # fixing! throwing a KekulizeException seems to leave # the bonds in a bad state (all single bonds) Chem.SanitizeMol(mol, sanitizeOps=_SANITIZE_OPS) return mol
[docs]def assign_zero_coords_chirality(mol): """ Molecules with all-zero coordinates need to have the "chirality tags" primed from the atom parity properties. Once these are in place, we remove the conformer, and leave the mol in a state equivalent to one that came from a SMILES input. """ try: conf = mol.GetConformer() except ValueError: pass else: if coords_all_zero(conf): Chem.AssignAtomChiralTagsFromMolParity(mol) mol.RemoveAllConformers() return mol
[docs]def assign_stereochemistry(mol): try: conf = mol.GetConformer() except ValueError: pass else: if not coords_all_zero(conf): # If we have all-zero coordinates, we don't care about # the 2D/3D flag (which is read, rather than checked) if conf.Is3D(): # If we have 3D coordinates, calculate both chirality # and stereo bonds from them. Chem.AssignStereochemistryFrom3D(mol) return else: # If we have 2D coordinates available, use them # to refresh stereo bond directions Chem.DetectBondStereoChemistry(mol, conf) # The general case, valid both no conformation and calculate # stereochemistry from parity, and stereo bonds from bond directions. Chem.AssignStereochemistry(mol)
[docs]@audit_changes def correct_sgroup_coordinates(mol): """ If coordinates are generated, make sure the FIELDDISP property in the Sgroups are using relative coordinates. """ s_groups = Chem.GetMolSubstanceGroups(mol) Chem.ClearMolSubstanceGroups(mol) for s_group in s_groups: properties = s_group.GetPropsAsDict() if s_group.HasProp("FIELDDISP"): new_fielddisp = re.sub(r'[0-9\- .]*D[AR]([U ])', r' 0.0 0.0 DR\1', s_group.GetProp("FIELDDISP")) s_group.SetProp("FIELDDISP", new_fielddisp) Chem.AddMolSubstanceGroup(mol, s_group) return mol
[docs]def preprocess_molblock(molblock: str, config: Optional[dict] = None) -> str: """ Standardizes an MDL molblock :param molblock: input molblock :param config: dict specifying preprocessor options :return: standardized molblock """ mol = convert.sdf_to_rdkit(molblock) options = PreprocessorOptions.fromConfig(config) if config else None out_mol = preprocess(mol, options) return convert_to_molblock(out_mol, options)
[docs]@rdkit_adapter.suppress_rdkit_log() def preprocess( mol: Chem.rdchem.Mol, options: Optional[PreprocessorOptions] = None) -> Chem.rdchem.Mol: """ Standardizes an RDKit mol :param mol: input mol :param options: preprocessor options :return: standardized mol """ options = options or PreprocessorOptions() assert_not_blinded(mol, max_num_atoms=options.MAX_NUM_ATOMS) assert_not_query(mol) # Make sure the 3D flag matches the input coordinates before any Is3D calls try: conf = mol.GetConformer() except ValueError: pass else: has_3d_coords = any(xyz[2] != 0.0 for xyz in conf.GetPositions()) if conf.Is3D() and not has_3d_coords: conf.Set3D(False) mol = setup_mol(mol) mol = check_kekulization(mol, options) # grab the initial MDL chiral flag value initial_chiral_flag = getprop(mol.GetUnsignedProp, MOL_PROP_CHIRAL_FLAG, MOL_PROP_OFF) # Pop so we can later reapply with update indices from transformations initial_stereo_groups = mol.GetStereoGroups() substance_groups = Chem.GetMolSubstanceGroups(mol) Chem.ClearMolSubstanceGroups(mol) if options.REMOVE_PROPERTIES: mol = remove_properties(mol) # This needs to happen before any transformations that might change any bond/atom indexes. original_bond_mapping = tag_mol_indexes(mol) # clarification as to why we're removing Hs when ADD_ALL is set: # if we are going to be adding Hs later anyway, we might as # well remove them now in order to simplify the rest of the process # # In order to maintain the current Hs, EXPLICIT_HYDROGENS should be # set to "AS_IS" if not options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS: keep_wedged = options.EXPLICIT_HYDROGENS in ( ExplicitHydrogens.KEEP_WEDGED, ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED) mol = remove_explicit_hydrogens(mol, substance_groups, keep_wedged=keep_wedged) if options.STRIP_SALTS: mol = strip_salts(mol, salt_list=options.STRIP_SALTS) if options.KEEP_ONLY_LARGEST_STRUCTURE: mol = remove_fragments(mol, substance_groups) if options.NEUTRALIZE: mol = neutralize(mol, options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS) # Both transformations and tautomerization can "move" double bonds, # so we need to recalculate stereo if they cause any changes. # Also, tautomerization might reverse some of the transformations # we did, so run that first. if options.CHOOSE_CANONICAL_TAUTOMER: mol = generate_canonical_tautomer(mol) mol = apply_transformations(mol, options.TRANSFORMATIONS) # basically - who is doing the depiction - us or the input? coordgen_enabled = options.GENERATE_COORDINATES in ( GenerateCoordinates.FULL, GenerateCoordinates.FULL_ALIGNED) # To be removed with LDIDEAS-5327; for now, force coordinate generation if # the property indicating this compound was sketched is present coordgen_enabled = getprop(mol.GetBoolProp, value=LD_PROP_FORCE_COORDGEN, default=coordgen_enabled) mol.ClearProp(LD_PROP_FORCE_COORDGEN) # Force coordgen if input is 3D try: coordgen_enabled = coordgen_enabled or mol.GetConformer().Is3D() except ValueError: pass if not coordgen_enabled: rdkit_extensions.reapply_molblock_wedging(mol) elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ADD_ALL: mol = add_explicit_hydrogens(mol) elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED: mol = add_chiral_hs(mol) mol = add_explicit_hydrogens(mol, only_on_hetero=True) elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.KEEP_WEDGED: # if we are going to be setting the wedge orientation, we need # to add chiral Hs now, before coordinates are generated. Then # we set the bond wedging after generating coordinates mol = add_chiral_hs(mol) # This should happen after all transformations that can alter # atom and bond indexes. SGroups are reapplied onto the mol here. mol = update_mol_groups(mol, initial_stereo_groups, substance_groups, original_bond_mapping) # After the call to update_mol_groups(), substance_groups are no longer # in sync with mol, and should not be used agin. del substance_groups if options.CLEAR_INVALID_WEDGE_BONDS: mol = clear_wedge_bonds_from_achiral_centers(mol) # Calculating the enhanced stereochemistry should happen # after clear_wedge_bonds_from_achiral_centers(), since it does # recalculate stereo, and we might find/remove some chiral centers mol = calculate_enhanced_stereo(mol, options.CHIRAL_FLAG_0_MEANING, initial_chiral_flag) if options.STRIP_AND_GROUPS_ON_SINGLE_ATOM: mol = strip_stereo_and(mol) if coordgen_enabled: align = options.GENERATE_COORDINATES is GenerateCoordinates.FULL_ALIGNED mol = generate_coordinates(mol, align) mol = correct_sgroup_coordinates(mol) # now that coordinates are set, we need to assign bond wedging mol = wedge_clean(mol) if options.EXPLICIT_HYDROGENS in ( ExplicitHydrogens.KEEP_WEDGED, ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED): # Make sure we remove all non-wedged H again. # SGroups have been updated, so reread them. sgroups = Chem.GetMolSubstanceGroups(mol) keep_hetero = options.EXPLICIT_HYDROGENS == ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED mol = remove_explicit_hydrogens(mol, sgroups, keep_wedged=True, keep_hetero=keep_hetero) mol = remove_wiggly_bonds_around_double_bonds(mol) return mol
[docs]class BlindedCompoundError(ValueError): pass
[docs]def assert_not_blinded(mol: Chem.rdchem.Mol, max_num_atoms: Optional[int] = None): """ Checks imcoming mol to confirm it has real atoms; if it doesn't it may have been intentionally stripped by the caller. LiveDesign marks these structures as having been "blinded", meaning a customer may have IP/legal restrictions, or there's a delay in registering the structure despite having assay data available. Currently, LiveDesign handles these structures by keeping a row in the LiveReport, but without an associated SDF or image. This is different from other registration errors, which are simply archived. :param mol: RDKit Mol to consider :param max_mol_wt: maximum allowed molecular weight """ num_atoms = mol.GetNumAtoms() if num_atoms == 0: raise BlindedCompoundError("No atoms present") if all(atom.GetAtomicNum() == 0 for atom in mol.GetAtoms()): raise BlindedCompoundError("Only dummy atoms present") if max_num_atoms is not None and num_atoms > max_num_atoms: raise BlindedCompoundError( f"{num_atoms} atoms present; exceeds {max_num_atoms} atom limit")
[docs]def assert_not_query(mol: Chem.rdchem.Mol): """ Checks incoming mol to confirm there are no query features present on atoms or bonds, which would otherwise make it not compatible with registration. :param mol: RDKit Mol to consider """ if mol.HasProp(MOL_PROP_LINK_NODES): raise ValueError("Link node molecule query feature found") has_polymer = any( convert.is_polymer(sg) for sg in Chem.GetMolSubstanceGroups(mol)) for bond in mol.GetBonds(): if bond.HasProp(BOND_PROP_END_PTS): raise ValueError("Variable attachment bond found") if bond.HasQuery(): raise ValueError("Bond query feature found") for atom in mol.GetAtoms(): if atom.HasQuery(): if atom.HasProp(MOL_PROP_R_LABEL): raise ValueError("Atom with R Label found") if not is_queryatom_exception(atom) and not has_polymer: raise ValueError("Atom query feature found")
[docs]def get_atoms_mapping(mol_atoms): atom_idx_mapping = {} for at in mol_atoms: try: atom_idx_mapping[at.GetUnsignedProp( ATOM_PROP_ORIGINAL_INDEX)] = at.GetIdx() except KeyError: pass return atom_idx_mapping
[docs]def get_bonds_mapping(mol, original_bond_mapping, atom_idx_mapping): bond_idx_mapping = {} for bnd_idx, bnd_atom_old_idxs in original_bond_mapping.items(): try: bnd_atom_new_idxs = [ atom_idx_mapping[old_idx] for old_idx in bnd_atom_old_idxs ] except KeyError: # Any of the atoms does not exist continue new_bnd = mol.GetBondBetweenAtoms(*bnd_atom_new_idxs) if new_bnd is None: # The atoms exist, but are no longer bonded continue bond_idx_mapping[bnd_idx] = new_bnd.GetIdx() return bond_idx_mapping
[docs]def tag_mol_indexes(mol): """ Tag atoms with the initial indexes on the mol and create a bond mapping. Bonds are less stable than atoms, so we create an external mapping to the atoms they bind """ for atom in mol.GetAtoms(): atom.SetUnsignedProp(ATOM_PROP_ORIGINAL_INDEX, atom.GetIdx()) bond_mapping = { bnd.GetIdx(): (bnd.GetBeginAtomIdx(), bnd.GetEndAtomIdx()) for bnd in mol.GetBonds() } return bond_mapping
def _map_indexes(mapping, original_idxs): original_idxs = set(original_idxs) ret = set() for idx in original_idxs: try: new_idx = mapping[idx] except KeyError: pass else: ret.add(new_idx) return sorted(ret), original_idxs != ret
[docs]def check_attachment_points_changed(sg, atom_idx_mapping): for attachpt in sg.GetAttachPoints(): # if aIdx has been removed, then the SGroup doesn't make sense, # and we should drop it try: new_aIdx = atom_idx_mapping[attachpt.aIdx] except KeyError: raise ValueError('Attach point invalidated') else: if new_aIdx != attachpt.aIdx: return True # lvIdx might be a H atom we made implicit try: new_lvIdx = atom_idx_mapping[attachpt.lvIdx] except KeyError: return True else: if new_lvIdx != attachpt.lvIdx: return True return False
[docs]def check_cstate_changed(sg, bond_idx_mapping): for cstate in sg.GetCStates(): # Drop the SGroup if the CState bond has been removed. try: new_bond = bond_idx_mapping[cstate.bondIdx] except KeyError: raise ValueError('CState invalidated') else: if new_bond != cstate.bondIdx: return True return False
[docs]def update_sgroup_indexes(sg, sg_atoms, sg_parent_atoms, sg_bonds, atom_idx_mapping, bond_idx_mapping): sg.SetAtoms(sg_atoms) sg.SetParentAtoms(sg_parent_atoms) sg.SetBonds(sg_bonds) attachpts = sg.GetAttachPoints() sg.ClearAttachPoints() for attachpt in attachpts: aIdx = atom_idx_mapping[attachpt.aIdx] # lvIdx might be a H atom we made implicit try: lvIdx = atom_idx_mapping[attachpt.lvIdx] except KeyError: lvIdx = 0 sg.AddAttachPoint(aIdx, lvIdx, attachpt.id) cstates = sg.GetCStates() sg.ClearCStates() for cstate in cstates: bond_idx = bond_idx_mapping[cstate.bondIdx] sg.AddCState(bond_idx, cstate.vector)
[docs]def update_mol_groups(mol, stereo_groups, substance_groups, original_bond_mapping): """ Update atoms and bonds in stereo and substance groups, dropping any atoms/groups that are no longer valid for the current state of the mol. """ if not stereo_groups and not substance_groups: return mol atom_idx_mapping = get_atoms_mapping(mol.GetAtoms()) bond_idx_mapping = get_bonds_mapping(mol, original_bond_mapping, atom_idx_mapping) if stereo_groups: mol = update_stereo_groups(mol, stereo_groups, atom_idx_mapping) if substance_groups: mol = update_sgroups(mol, substance_groups, atom_idx_mapping, bond_idx_mapping) return mol
[docs]@audit_changes def update_sgroups(mol, substance_groups, atom_idx_mapping, bond_idx_mapping): """ Update SGroups to reflect the transformations done on mol, updating with new atom and bond indexes, as well as atoms that might have been added or removed. """ for sg in substance_groups: sg_atoms, atoms_changed = _map_indexes(atom_idx_mapping, sg.GetAtoms()) sg_parent_atoms, parent_atoms_changed = _map_indexes( atom_idx_mapping, sg.GetParentAtoms()) sg_bonds, bonds_changed = _map_indexes(bond_idx_mapping, sg.GetBonds()) try: attp_changed = check_attachment_points_changed(sg, atom_idx_mapping) cstate_changed = check_cstate_changed(sg, bond_idx_mapping) except ValueError: continue # If any of the lists changed, we need to update the SGroup if any((atoms_changed, parent_atoms_changed, bonds_changed, attp_changed, cstate_changed)): # If we removed everything that the SGroup contained, then drop it. if len(sg_atoms) == len(sg_parent_atoms) == len(sg_bonds) == 0: continue # Add the SGroup to the updated mol so that checks are made against it sg = Chem.AddMolSubstanceGroup(mol, sg) update_sgroup_indexes(sg, sg_atoms, sg_parent_atoms, sg_bonds, atom_idx_mapping, bond_idx_mapping) else: Chem.AddMolSubstanceGroup(mol, sg) return mol
[docs]@audit_changes def update_stereo_groups(mol, stereo_groups, atom_idx_mapping): rwmol = Chem.RWMol(mol) new_groups = [] for group in stereo_groups: old_indexes = [at.GetIdx() for at in group.GetAtoms()] new_indexes, _ = _map_indexes(atom_idx_mapping, old_indexes) if new_indexes: new_groups.append( Chem.CreateStereoGroup(group.GetGroupType(), rwmol, new_indexes)) rwmol.SetStereoGroups(new_groups) return rwmol.GetMol()
[docs]@audit_changes def add_explicit_hydrogens(mol, only_on_hetero=False): if only_on_hetero: hetero_atoms = [ at.GetIdx() for at in mol.GetAtoms() if at.GetAtomicNum() != 6 ] return rdmolops.AddHs(mol, onlyOnAtoms=hetero_atoms, explicitOnly=False, addCoords=True) return rdmolops.AddHs(mol, explicitOnly=False, addCoords=True)
[docs]@audit_changes def remove_explicit_hydrogens(mol, sgroups, keep_wedged=False, keep_hetero=False): # we use this to flag Hs that should not be removed. # there's no reasonable scenario in which an H atom # in a molecule could have an isotope value of 10, so # this should be safe save_isotope = 10 ps = Chem.RemoveHsParameters() ps.updateExplicitCount = True ps.removeWithWedgedBond = not keep_wedged # Always remove double bond stereo Hs, except for imines, # which we handle below. In any other case, the double bond # stereo direction is either incorrect (e.g. 2 H neighbors), # or can be flipped to the other neighbor (in sdf, double bond # neighbors cannot be flagged to specifically carry the direction, # so we can't tell if a H is meant to be kept) ps.removeDefiningBondStereo = True saved_hs = False if keep_wedged: for atom in mol.GetAtoms(): if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0: for bnd in atom.GetBonds(): # why this is as complex as it is: # Information about bond wedging is stored as a bond property after the CTAB # is parsed. Because the values have different meanings, the V2000 and V3000 # parsers use different property names. # The values here correspond to bonds which are wedged, hashed, or "squiggled" # above comment valid for at least the 2020.03 RDKit release if getprop(bnd.GetUnsignedProp, BOND_PROP_BOND_STEREO) in (1, 6, 4) or \ getprop(bnd.GetUnsignedProp,BOND_PROP_BOND_CONFIG) in (1, 2, 3): atom.SetIsotope(save_isotope) saved_hs = True break if keep_hetero and bnd.GetOtherAtom( atom).GetAtomicNum() != 6: atom.SetIsotope(save_isotope) saved_hs = True break # Keep imine stereogenic Hs to preserver stereo for imine_match in mol.GetSubstructMatches(IMINE_QUERY): if imine_match: _, n_atom, h_atom = imine_match bnd = mol.GetBondBetweenAtoms(n_atom, h_atom) if bnd.GetBondDir() == Chem.BondDir.ENDDOWNRIGHT or \ bnd.GetBondDir() == Chem.BondDir.ENDUPRIGHT: mol.GetAtomWithIdx(h_atom).SetIsotope(save_isotope) saved_hs = True # hydrogens that are an in an XBOND in a polymer sgroup should not be # removed, or else RDKit will remove the entire sgroup for sgroup in sgroups: if not convert.is_polymer(sgroup): continue for bnd_idx in sgroup.GetBonds(): bnd = mol.GetBondWithIdx(bnd_idx) for atom in [bnd.GetBeginAtom(), bnd.GetEndAtom()]: if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0: atom.SetIsotope(save_isotope) saved_hs = True res = rdmolops.RemoveHs(mol, ps, sanitize=False) if saved_hs: # Isotopes were set on the input mol, and they are copied to the result mol. # We want the input to be unchanged, so we remove isotopes from both mols. for m in (res, mol): for atom in m.GetAtoms(): if atom.GetAtomicNum() == 1 and atom.GetIsotope( ) == save_isotope: atom.SetIsotope(0) return res
[docs]def convert_to_molblock(mol, options=None): """ Convert processed mol into a molblock and make necessary updates. """ processed_molblock = convert.rdkit_to_sdf(mol) options = options or PreprocessorOptions() if options.HEAVY_HYDROGEN_DT: processed_molblock = convert_heavy_hydrogens(processed_molblock) return processed_molblock
[docs]def convert_heavy_hydrogens(molblock): """ NOTE that this operates on a molblock, not a molecule The RDKit does not currently (v2020.03) support writing D or T to mol blocks, so we need to post-process the text. Fortunately it's an easy regex in v3000 mol blocks. This does not work with V2000 mol blocks, so we throw a ValueError there. This doesn't seem like a big deal since V2000 support is primarly being kept around for debugging purposes. If we need to eventually support V2000+HEAVY_HYDROGEN_DT, some not-completely-trivial code will need to be written. """ d_expr = re.compile(r"^(M V30 [0-9]+) H (.+?) MASS=2([^\n]*)", flags=re.MULTILINE) out_molblock = d_expr.sub(r"\1 D \2\3", molblock) t_expr = re.compile(r"^(M V30 [0-9]+) H (.+?) MASS=3([^\n]*)", flags=re.MULTILINE) out_molblock = t_expr.sub(r"\1 T \2\3", out_molblock) return out_molblock
[docs]@audit_changes def neutralize(mol, checkForProblematicHs=False): res = uncharger.uncharge(mol) res.UpdatePropertyCache(strict=False) if checkForProblematicHs: # NOTE the logic here really only applies to the organic subset. # it's not going to do well with other element types, but hopefully # no-one is doing neutralization on species with non-organic atoms # and EXPLICIT_HS set to "AS_IS" if res.GetNumAtoms() != mol.GetNumAtoms(): raise RuntimeError("uncharging changed the number of atoms") for i in range(res.GetNumAtoms()): oAt = mol.GetAtomWithIdx(i) oChg = oAt.GetFormalCharge() if oChg > 0: nAt = res.GetAtomWithIdx(i) nChg = nAt.GetFormalCharge() if nChg != oChg: # positive charges on atoms with Hs should have had an H removed, but # we can't currently do that if oAt.GetTotalNumHs( includeNeighbors=True) - oAt.GetTotalNumHs(): raise ValueError( "could not neutralize positive charged atom with explicit H neighbors" ) return res
[docs]def unicode_to_str(unicode_str): """ Takes a unicode object and converts it to a str (utf-8). If the arg is already a str, returns unicode_str (i.e. if run with python 3). Needed to support python 2/3 with unicode_literals. py2: type<unicode> -> type<str utf-8> py3: type<str utf-8> (no unicode type exists) :type unicode_str: unicode (py2) or str (py3) :param unicode_str: the unicode that potentially needs converting (i.e. if run with python 2) :return: str """ if isinstance(unicode_str, str): return unicode_str return unicode_str.encode("utf-8")
[docs]@audit_changes def transform(mol, transformation): """ apply the transformation to the molecule repeatedly until it no longer applies. the maxTransformations argument is just there to prevent us from ending up in an infinite loop due to a bogus transformation Please note that running transformations may alter the stereochemistry of mol, so a stereo recalculation from coordinates might be required. """ max_transformations = 1000 smarts = unicode_to_str(transformation) rxn = rdChemReactions.ReactionFromSmarts(smarts) for _ in range(max_transformations): output = rxn.RunReactants((mol,)) if output: mol = output[0][0] mol.UpdatePropertyCache(strict=False) else: # When we get here, the reaction can no longer be applied # on whatever mol is at that moment, so don't try again break return mol
[docs]def in_xy_plane(mol): if mol.GetNumConformers() == 0: return False return not mol.GetConformer().Is3D()
[docs]@audit_changes def generate_coordinates(mol, align=False): if mol.GetNumAtoms() < 2: # we don't really need to deal with coordgen if there's only a single atom mol.RemoveAllConformers() conf = Chem.Conformer(mol.GetNumAtoms()) conf.Set3D(False) mol.AddConformer(conf) return mol # keep original mol to align with original_mol = Chem.Mol(mol) Chem.rdCoordGen.AddCoords(mol) # align input mol and mol with new coords if align and in_xy_plane(mol) and in_xy_plane(original_mol): rdMolAlign.AlignMol(mol, original_mol) # determine new coordinates of polymer brackets mol = convert.clean_up_polymer_brackets(mol, original_mol) return mol
def _find_matches(mol, st): """Which atoms match?""" # it's possible that atoms were added or deleted. We hope that this is only # hydrogens rdk_non_hydrogens = sum(1 for a in mol.GetAtoms() if a.GetAtomicNum() != 1) schro_non_hydrogens = sum(1 for a in st.atom if a.atomic_number != 1) if rdk_non_hydrogens != schro_non_hydrogens: msg = (f"Schrodinger structure doesn't match RDKit mol, they have " f"different numbers of non-hydrogen atoms (RDKit: " f"{rdk_non_hydrogens} != Schrodinger: {schro_non_hydrogens})") raise rdkit_adapter.InconsistentStructureError(msg) matches = {} for a in st.atom: try: idx = a.property[adapter.RDK_INDEX] except KeyError: continue rdkit_atom = mol.GetAtomWithIdx(idx) if rdkit_atom: matches[a] = rdkit_atom return matches _BOND_TYPES_S2R = { structure.BondType.Single: Chem.BondType.SINGLE, structure.BondType.Double: Chem.BondType.DOUBLE, structure.BondType.Triple: Chem.BondType.TRIPLE, structure.BondType.Zero: Chem.BondType.ZERO, structure.BondType.Dative: Chem.BondType.DATIVE } def _move_bond_orders(st, editmol, matches): """Move bond orders and parity from Schrodinger to RDKit""" for bond in st.bond: a1, a2 = bond.atom1, bond.atom2 rdk_a1 = matches.get(a1) rdk_a2 = matches.get(a2) if not rdk_a1 or not rdk_a2: continue rdbond = editmol.GetBondBetweenAtoms(rdk_a1.GetIdx(), rdk_a2.GetIdx()) new_type = _BOND_TYPES_S2R[bond.type] if new_type != rdbond.GetBondType(): rdbond.SetBondType(new_type) if new_type == Chem.BondType.SINGLE: # clear the bond direction if a double bond became a single bond rdbond.SetBondDir(Chem.BondDir.NONE)
[docs]def copy_lewis_structure_and_hydrogens(st, mol): """ Applies bond orders and charges from st to mol. Updates #implicitH to match Assumes st includes all hydrogens. Adds implicit and explicit hydrogens to the mol, but does not add any graph hydrogens. May remove graph hydrogens. """ editmol = Chem.RWMol(mol) matches = _find_matches(editmol, st) _move_bond_orders(st, editmol, matches) # Adjust charges - assumes that the mmct charge is good. for mmct_atom, rdkit_atom in matches.items(): charge = mmct_atom.formal_charge if charge != rdkit_atom.GetFormalCharge(): rdkit_atom.SetFormalCharge(charge) Chem.SanitizeMol(editmol, sanitizeOps=Chem.SANITIZE_ADJUSTHS) graph_hydrogens_to_delete = [] for mmct_atom, rdkit_atom in matches.items(): mmct_hydrogens = sum( (1 for a in mmct_atom.bonded_atoms if a.atomic_number == 1)) mmct_hydrogens += mm.mmat_get_nhua(mmct_atom.atom_type) # hydrogen count is implicit + explicit (shown in smiles) + graph hydrogens rdk_hydrogens = rdkit_atom.GetTotalNumHs(includeNeighbors=True) # A hydrogen was added or removed on the Schrodinger side if rdk_hydrogens != mmct_hydrogens: new_explicit_h_count = rdkit_atom.GetNumExplicitHs( ) + mmct_hydrogens - rdk_hydrogens if new_explicit_h_count >= 0: rdkit_atom.SetNumExplicitHs(new_explicit_h_count) else: # Need to delete graph hydrogens hydrogens = [ a.GetIdx() for a in rdkit_atom.GetNeighbors() if a.GetAtomicNum() == 1 ] remove_count = rdk_hydrogens - mmct_hydrogens assert len(hydrogens) >= remove_count for h_index in hydrogens[:remove_count]: graph_hydrogens_to_delete.append(h_index) graph_hydrogens_to_delete.sort(reverse=True) for atom in graph_hydrogens_to_delete: editmol.RemoveAtom(atom) Chem.SanitizeMol(editmol, sanitizeOps=_SANITIZE_OPS) mol = editmol.GetMol() mol.UpdatePropertyCache(strict=False) return mol
[docs]@audit_changes def generate_canonical_tautomer(mol): st = rdkit_adapter.from_rdkit(mol, include_properties=False, include_stereo=False) try: st = epik_tautomer_canonicalizer.TautomerCanonicalizer().apply(st)[0] except TautomerizerException as e: logger.warning(f"failed finding a canonical tautomer: {str(e)}") return mol return copy_lewis_structure_and_hydrogens(st, mol)
[docs]@audit_changes def clear_wedge_bonds_from_achiral_centers(mol): modified = False new_mol = Chem.Mol(mol) Chem.AssignStereochemistry(new_mol, cleanIt=True, force=True) for orig_at, new_at in zip(mol.GetAtoms(), new_mol.GetAtoms()): if orig_at.GetChiralTag() != new_at.GetChiralTag(): modified = True break if not modified: for orig_bnd, new_bnd in zip(mol.GetBonds(), new_mol.GetBonds()): if (orig_bnd.GetStereo() != new_bnd.GetStereo() or orig_bnd.GetBondDir() != new_bnd.GetBondDir()): modified = True break return new_mol
[docs]@audit_changes def calculate_enhanced_stereo(mol, enh_stereo_default_grouping, initial_chiral_flag): # Place any chiral atom not in an enhanced stereo group into an enhanced # stereo group with type determined by ChiralFlag0Meaning enhanced_stereo_atoms = [ at.GetIdx() for sg in mol.GetStereoGroups() for at in sg.GetAtoms() ] chiral_atoms = [ at.GetIdx() for at in mol.GetAtoms() if at.GetChiralTag() in (Chem.ChiralType.CHI_TETRAHEDRAL_CCW, Chem.ChiralType.CHI_TETRAHEDRAL_CW) and not (at.GetIdx() in enhanced_stereo_atoms) ] if len(chiral_atoms) != 0: rwmol = Chem.RWMol(mol) stereo_groups = [sg for sg in rwmol.GetStereoGroups()] if initial_chiral_flag == MOL_PROP_ON or enh_stereo_default_grouping == ChiralFlag0Meaning.UNGROUPED_ARE_ABSOLUTE: for idx, sg in enumerate(stereo_groups): if sg.GetGroupType() == Chem.StereoGroupType.STEREO_ABSOLUTE: # chiral atoms should be added to existing enhanced stereo group chiral_atoms.extend([at.GetIdx() for at in sg.GetAtoms()]) stereo_groups.pop(idx) break stereo_groups.append( Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_ABSOLUTE, rwmol, chiral_atoms)) elif enh_stereo_default_grouping == ChiralFlag0Meaning.UNGROUPED_ARE_RACEMIC: stereo_groups.append( Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_AND, rwmol, chiral_atoms)) else: stereo_groups.append( Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_OR, rwmol, chiral_atoms)) rwmol.SetStereoGroups(stereo_groups) mol = rwmol.GetMol() # SHARED-8608: Write chiral flag as 0 in the preprocessor unless all # enhanced stereo groups are abs sg_is_abs = [ sg.GetGroupType() == Chem.StereoGroupType.STEREO_ABSOLUTE for sg in mol.GetStereoGroups() ] if len(sg_is_abs) > 0 and all(sg_is_abs): mol.SetUnsignedProp(MOL_PROP_CHIRAL_FLAG, MOL_PROP_ON) else: mol.ClearProp(MOL_PROP_CHIRAL_FLAG) return mol
[docs]@audit_changes def strip_stereo_and(input_mol): """ Removes any Stereo AND groups with only one center and flattens the bonds around it :param input_mol: The original molecule to consider :return: post-processed molecule, if the input molecule was modified """ mol = Chem.RWMol(input_mol) keep = [] for stereo_group in mol.GetStereoGroups(): if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_AND: keep.append(stereo_group) elif len(stereo_group.GetAtoms()) > 1: keep.append(stereo_group) else: # flatten the atom, At this point the stereo_group only has one atom atom = stereo_group.GetAtoms()[0] atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED) # make sure that all bond wedging information has been removed around that atom too: for bond in atom.GetBonds(): if bond.GetBeginAtomIdx() == atom.GetIdx(): bond.ClearProp(BOND_PROP_BOND_STEREO ) # what we'd get from a V2000 mol file bond.ClearProp(BOND_PROP_BOND_CONFIG ) # what we'd get from a V3000 mol file mol.SetStereoGroups(keep) return Chem.Mol(mol)
[docs]def frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles, largest_smiles): """ A fragment is considered larger if its atoms/weight are larger, the length of the smiles string is larger, or the smiles string is lexicographically *smaller* if they are equal length. ie, 'AAA' is larger than 'AAB'.. hence the final smiles > largest_smiles check here to reject """ if atoms < largest_atoms: return True if atoms > largest_atoms: return False if weight < largest_weight: return True if weight > largest_weight: return False if len(smiles) < len(largest_smiles): return True if len(smiles) > len(largest_smiles): return False return smiles > largest_smiles
[docs]def connect_variable_attachment_points(mol): """ forms zero-order bonds between one of the atoms of a bond with an ATTACH property to the "main" molecule in order to have the molecule+variable attachment point treated as a single fragment returns a 2-tuple with: 1. the modified molecule 2. whether or not the molecule was modified """ bonds_added = False res = None for bond in list(mol.GetBonds()): if bond.HasProp("_MolFileBondAttach") and bond.HasProp( "_MolFileBondEndPts"): # these should look like: ENDPTS=(3 4 6 5) endp_txt = bond.GetProp("_MolFileBondEndPts") # do a bit of validation that the format is correct: if endp_txt[0] != "(" or endp_txt[-1] != ")": logger.warning("invalid ENDPTS format ignored") continue endps = [int(x) for x in endp_txt[1:-1].split(" ")] # do a bit of validation that the format is correct: if len(endps) != endps[0] + 1 or endps[0] == 1: logger.warning("invalid ENDPTS format ignored") continue batom_idx = bond.GetBeginAtomIdx() oatom_idx = endps[1] - 1 if res is None: res = Chem.RWMol(mol) if res.GetBondBetweenAtoms(batom_idx, oatom_idx): logger.warning("variable attachment point already connected") continue res.AddBond(batom_idx, oatom_idx, Chem.BondType.ZERO) # set a property on the bond so that we can find it again: res.GetBondBetweenAtoms(batom_idx, oatom_idx).SetIntProp( "variable_attach_bond", 1) bonds_added = True if res is None: res = mol return res, bonds_added
[docs]@audit_changes def remove_fragments(mol, substance_groups): """ Fragments are not removed if the molecule contains any SGroups which are associated with polymers Use the following criteria to remove unwanted fragments from *mol*: 1. keep only the fragment which has the most number of atoms 2. break ties by keeping only fragments with the greatest molecular weight 3. break ties with the longest smiles string 4. break additional ties by keeping the fragment with the earliest alpha sorted SMILES string If two or more identical fragments remain after 1-4, we will throw a fatal error. """ for sg in substance_groups: if getprop(sg.GetProp, "TYPE") in ( "MUL", "SRU", "MON", "COP", "CRO", "GRA", "MIX", "FOR", "MOD", ): logger.debug( "Molecule has polymeric SGroups, skipping remove_fragments.") return mol nmol, bonds_added = connect_variable_attachment_points(mol) frags = Chem.GetMolFrags(nmol, asMols=True, sanitizeFrags=False) if len(frags) < 2: return mol largest_frag = None largest_atoms = math.inf largest_weight = math.inf largest_smiles = None duplicates = 0 for frag in frags: [atoms, weight, smiles] = [ frag.GetNumHeavyAtoms(), MolWt(frag), Chem.MolToSmiles(frag), ] if largest_frag is not None: if frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles, largest_smiles): continue # If all properties are equivalent, we have a *potential* duplicate of the largest known fragment # but if they are not, we've discovered a new, bigger fragment, and may discard any known dupes if (atoms == largest_atoms and weight == largest_weight and smiles == largest_smiles): duplicates += 1 else: duplicates = 0 largest_frag = frag largest_atoms = atoms largest_weight = weight largest_smiles = smiles else: largest_frag = frag largest_atoms = atoms largest_weight = weight largest_smiles = smiles if duplicates: message = ( "At the end of REMOVE_FRAGMENTS, we had {duplicates} identical " "fragments left over, returning only one.") logger.debug(message.format(duplicates=duplicates)) if bonds_added: largest_frag = Chem.RWMol(largest_frag) bnds = list(largest_frag.GetBonds()) for bnd in bnds: if bnd.HasProp("variable_attach_bond"): largest_frag.RemoveBond(bnd.GetBeginAtomIdx(), bnd.GetEndAtomIdx()) largest_frag = largest_frag.GetMol() return largest_frag
[docs]@audit_changes def remove_properties(mol): for prop_name in mol.GetPropNames(): mol.ClearProp(prop_name) return mol
[docs]@audit_changes def strip_salts(mol, salt_list): salt_list_str = "\n".join(salt_list) remover = SaltRemover.SaltRemover(defnData=salt_list_str) res = remover.StripMol(mol, sanitize=False) if res.GetNumAtoms() == 0: raise ValueError("Removing salts produced an empty structure") return res
[docs]def apply_transformations(mol, transformations): """ Apply the given list of transformations, and recalculate stereo if at least one transformation applies. """ for tform in transformations: mol = transform(mol, tform) # Remove properties leftover after RunReactants transformations for atom in mol.GetAtoms(): for prop in (ATOM_PROP_MAP_NUMBER, "old_mapno", "react_atom_idx"): if atom.HasProp(prop): atom.ClearProp(prop) Chem.SanitizeMol(mol, sanitizeOps=_SANITIZE_OPS) # If any reactions applied, we need to rescan stereo from coordinates # because bond orders & stereo bonds might have changed assign_stereochemistry(mol) return mol
[docs]@audit_changes def add_chiral_hs(mol): # need to filter chiral_atoms to get only atoms with endocyclic stereobonds # since we don't want to add hydrogens to chiral atoms that already have # exocyclic stereobonds chiral_atoms = [center[0] for center in Chem.FindMolChiralCenters(mol)] chiral_atoms_needing_hydrogens = [] for atom_idx in chiral_atoms: atom = mol.GetAtomWithIdx(atom_idx) # only mark this for adding hydrogens if there are no exocyclic bonds add_atom = all(bond.IsInRing() for bond in atom.GetBonds()) if add_atom: chiral_atoms_needing_hydrogens.append(atom_idx) # return mol if there are no chiral atoms to be adjusted if not chiral_atoms_needing_hydrogens: return mol return Chem.AddHs( mol, explicitOnly=False, addCoords=True, onlyOnAtoms=chiral_atoms_needing_hydrogens, )
[docs]@audit_changes def wedge_clean(mol): if not mol.GetNumConformers(): return mol # resetting the BondDir and UnknownStereo setting on each bond # so WedgeMolBonds can do its magic for bond in mol.GetBonds(): if bond.GetBondType() == Chem.BondType.SINGLE and bond.IsInRing(): if bond.GetBondDir() == Chem.BondDir.UNKNOWN: bond.SetIntProp(MOL_PROP_UNKNOWN_STEREO, MOL_PROP_ON) bond.SetBondDir(Chem.BondDir.NONE) # bonds which were explicitly marked as unknown: if getprop(bond.GetUnsignedProp, BOND_PROP_BOND_STEREO) == 4 or \ getprop(bond.GetUnsignedProp, BOND_PROP_BOND_CONFIG) == 2: # ensure that the bond doesn't start at an atom with an specified double # bond disqualify = False for obond in bond.GetBeginAtom().GetBonds(): if (obond.GetBondType() == Chem.BondType.DOUBLE and obond.GetStereo() != Chem.BondStereo.STEREONONE): disqualify = True break if not disqualify: bond.SetBondDir(Chem.BondDir.UNKNOWN) Chem.WedgeMolBonds(mol, mol.GetConformer()) return mol
[docs]@audit_changes def remove_wiggly_bonds_around_double_bonds(mol): # We just need to make sure that there any wiggly bonds starting at atoms # involved in double bonds don't end up in the output for bond in mol.GetBonds(): if bond.GetBondType() == Chem.BondType.SINGLE and ( getprop(bond.GetUnsignedProp, BOND_PROP_BOND_STEREO) == 4 or getprop(bond.GetUnsignedProp, BOND_PROP_BOND_CONFIG) == 2): # it's a wiggly bond. Check to see if it starts at an atom with a double bond startsAtDouble = False for obond in bond.GetBeginAtom().GetBonds(): if (obond.GetBondType() == Chem.BondType.DOUBLE and obond.GetStereo() != Chem.BondStereo.STEREONONE): startsAtDouble = True break if startsAtDouble: if bond.HasProp(BOND_PROP_BOND_STEREO): bond.ClearProp(BOND_PROP_BOND_STEREO) if bond.HasProp(BOND_PROP_BOND_CONFIG): bond.ClearProp(BOND_PROP_BOND_CONFIG) bond.SetBondDir(Chem.BondDir.NONE) return mol