Source code for schrodinger.livedesign.preprocessor

import copy
import datetime
import enum
import math
import re
from typing import Any
from typing import Callable
from typing import NamedTuple
from typing import Optional
from typing import Tuple

import decorator
import numpy as np
from rdkit import Chem
from rdkit.Chem import SaltRemover
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolAlign
from rdkit.Chem import rdmolops
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.MolStandardize import rdMolStandardize

import schrodinger
from schrodinger import adapter
from schrodinger import rdkit_extensions
from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.livedesign import convert
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log

try:
    from schrodinger.application.epik import TautomerizerException
    from schrodinger.application.epik import epik_tautomer_canonicalizer
except ImportError as e:
    epik_tautomer_canonicalizer = e

logger = log.get_output_logger(__file__)

audit_changes_log = None


[docs]def initialize_audit_log(verbose: bool):
    """Initialize global audit for logging purposes"""
    global audit_changes_log
    audit_changes_log = [] if verbose else None


# statically store the MolVS class used in NEUTRALIZE here so it is only
# created and initialized once:
uncharger = rdMolStandardize.Uncharger(canonicalOrder=True)

# string constants for mol props
ATOM_PROP_MOL_PARITY = 'molParity'
ATOM_PROP_ORIGINAL_INDEX = '_AtomOriginalIndex'
ATOM_PROP_DUMMY_LABEL = 'dummyLabel'
ATOM_PROP_ATOM_LABEL = 'atomLabel'
ATOM_PROP_MAP_NUMBER = 'molAtomMapNumber'
BOND_PROP_BOND_STEREO = "_MolFileBondStereo"
BOND_PROP_BOND_CONFIG = "_MolFileBondCfg"
BOND_PROP_BOND_TYPE = '_MolFileBondType'
BOND_PROP_END_PTS = "_MolFileBondEndPts"
MOL_PROP_LINK_NODES = "_molLinkNodes"
MOL_PROP_ATTACHPT = "molAttchpt"
MOL_PROP_CHIRAL_FLAG = "_MolFileChiralFlag"
MOL_PROP_R_LABEL = "_MolFileRLabel"
MOL_PROP_UNKNOWN_STEREO = "_UnknownStereo"
LD_PROP_FORCE_COORDGEN = "b_ld_alwayscoordgen"
MOL_PROP_ON = 1
MOL_PROP_OFF = 0

_SANITIZE_OPS = Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_CLEANUPCHIRALITY ^ Chem.SANITIZE_FINDRADICALS

STD_DEFAULT_SALTS = (
    "[Cl,Br,I]",
    "[Li,Na,K,Ca,Mg]",
    "[O,N]",
    "[N](=O)(O)O",
    "[P](=O)(O)(O)O",
    "[P](F)(F)(F)(F)(F)F",
    "[S](=O)(=O)(O)O",
    "[CH3][S](=O)(=O)(O)",
    # p-Toluene sulfonate
    "c1cc([CH3])ccc1[S](=O)(=O)(O)",
    # Acetic acid
    "[CH3]C(=O)O",
    # TFA
    "FC(F)(F)C(=O)O",
    # Fumarate/Maleate
    "OC(=O)C=CC(=O)O",
    # Oxalate
    "OC(=O)C(=O)O",
    # Tartrate
    "OC(=O)C(O)C(O)C(=O)O",
    # Dicylcohexylammonium
    "C1CCCCC1[NH]C1CCCCC1",
)

STD_TRANSFORMATIONS = (
    # Nitro and N-oxide (pentavalent N)
    "[#7+0;v5:1]=[#8+0:2]>>[#7+:1]-[#8-:2]",
    # Phosphonium Ylide
    "[#6:3][P+:1]([#6:4])([#6:5])[#6-:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]",
    # Incorrect (but unambiguous) Phosphonium Ylide observed in ChEMBL
    "[#6:3][P-:1]([#6:4])([#6:5])[#6+:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]",
    # Sulfoxide
    "[#6:3][S+:1]([#6:4])-[#8-:2]>>[#6:3][S;X3+0:1]([#6:4])=[#8-0:2]",
    # Phosphonate
    "[#6:3][P+:1]([#8;X2:4])([#8;X2:5])[#8-:2]>>[#6:3][P+0:1]([#8:4])([#8:5])=[#8-0:2]",
    # Sulfone
    "[#6:3][S+:1]([#6:4])([#8-:2])=[O:5]>>[#6:3][S+0:1]([#6:4])(=[#8-0:2])=[O:5]",
    # Azide
    "[#7;A;X2-:1][N;X2+:2]#[N;X1:3]>>[#7-0:1]=[N+:2]=[#7-:3]",
    # Diazo
    "[#6;X3-:1][N;X2+:2]#[N;X1:3]>>[#6-0;A:1]=[N+:2]=[#7-:3]",
)

IMINE_QUERY = Chem.MolFromSmarts(r'[*]=[N;X2;h0][H]')


[docs]class ExplicitHydrogens(enum.Enum):
    REMOVE_ALL = enum.auto()
    KEEP_WEDGED = enum.auto()
    ADD_ALL = enum.auto()
    AS_IS = enum.auto()
    ON_HETERO_AND_KEEP_WEDGED = enum.auto()


[docs]class GenerateCoordinates(enum.Enum):
    NONE = enum.auto()
    FULL = enum.auto()
    FULL_ALIGNED = enum.auto()


[docs]class ChiralFlag0Meaning(enum.Enum):
    UNGROUPED_ARE_ABSOLUTE = enum.auto()  # ABS (most customers choose this)
    UNGROUPED_ARE_RACEMIC = enum.auto()  # AND (BIOVIA standard)
    UNGROUPED_ARE_RELATIVE = enum.auto()  # OR


[docs]class PreprocessorOptions(NamedTuple):
    """
    Options dictating preprocessor actions

    :cvar KEEP_ONLY_LARGEST_STRUCTURE: whether additional discontiguous
        structures are removed
    :cvar REMOVE_PROPERTIES: whether properties are removed
    :cvar STRIP_SALTS: whether salts are removed
    :cvar RESOLVE_AMBIGUOUS_TAUTOMERS: whether to guess tautomeric state for
        ambiguous input that would otherwise fail with a kekulization error
    """
    # Filter options
    MAX_NUM_ATOMS: Optional[int] = None

    # Chemical identity options
    KEEP_ONLY_LARGEST_STRUCTURE: bool = True
    STRIP_SALTS: Optional[Tuple[str]] = None  # after removing small molecules
    NEUTRALIZE: bool = True
    TRANSFORMATIONS: Optional[Tuple[str]] = STD_TRANSFORMATIONS
    CHOOSE_CANONICAL_TAUTOMER: bool = False
    RESOLVE_AMBIGUOUS_TAUTOMERS: bool = False

    # Stereochemistry options
    CHIRAL_FLAG_0_MEANING: ChiralFlag0Meaning = ChiralFlag0Meaning.UNGROUPED_ARE_ABSOLUTE
    STRIP_STEREO_ABSOLUTE_GROUP: bool = True
    STRIP_AND_GROUPS_ON_SINGLE_ATOM: bool = False

    # Property options
    # These properties don't affect deduplication, but the properties and s-group data
    # from the first instance of a compound registered will be stored in LD
    REMOVE_PROPERTIES: bool = False

    # Depiction-related options
    # These will not affect deduplication
    GENERATE_COORDINATES: GenerateCoordinates = GenerateCoordinates.FULL_ALIGNED
    EXPLICIT_HYDROGENS: ExplicitHydrogens = ExplicitHydrogens.REMOVE_ALL
    CLEAR_INVALID_WEDGE_BONDS: bool = True
    HEAVY_HYDROGEN_DT: bool = False

    @staticmethod
    def _updateConfig(config: dict):
        """
        :param dict: configuration to validate/update deprecated options
        """

        release_version = schrodinger.get_release_name()
        remove_msg = ("is not a supported configuration option in the "
                      f"{release_version} version of the Schrodinger "
                      "Preprocessor and must be removed")

        # Validate/fix deprecated options
        key = "CORE_SUITE_NEUTRALIZE"
        if config.get(key):
            raise KeyError(f"{key} {remove_msg}")
        config.pop(key, None)

        key = "GENERATE_V3K_SDF"
        if config.get(key) is False:  # aka v2000
            raise KeyError(f"{key} {remove_msg}")
        config.pop(key, None)

        key = "DOUBLE_BOND_STEREO_STANDARD"
        if config.get(key, "CROSSED") != "CROSSED":
            raise KeyError(f"{key} {remove_msg}")
        config.pop(key, None)

        key = "REMOVE_SGROUP_DATA"
        if config.get(key, "NONE") != "NONE":
            raise KeyError(f"{key} {remove_msg}")
        config.pop(key, None)

        key = "FORCE_ABSOLUTE_STEREOCHEMISTRY"
        if key in config:
            raise KeyError(f"{key} {remove_msg}; see CHIRAL_FLAG_0_MEANING")

        key = "RING_REPRESENTATION"
        if config.get(key, "KEKULE") != "KEKULE":
            raise KeyError(f"{key} {remove_msg}")
        config.pop(key, None)

        # Completely remove, old value no longer has any effect
        config.pop("STRIP_STEREO_ABSOLUTE_GROUP", None)

        # Update deprecated option formats
        key = "STRIP_SALTS"
        value = config.get(key)
        if isinstance(value, dict):
            config[key] = value["SALTS"].split() if value["STRIP"] else []

        key = "TRANSFORMATIONS"
        value = config.get(key)
        if value and isinstance(next(iter(value), None), dict):
            config[key] = [transform["STRUCTURE"] for transform in value]

        key = "CHIRAL_FLAG_TREATMENT"
        if key in config:
            raise KeyError(f"{key} {remove_msg}; see CHIRAL_FLAG_0_MEANING")

        clean_wedge_orientation = config.get('CLEAN_WEDGE_ORIENTATION')
        coordgen_enabled = config.get('GENERATE_COORDINATES',
                                      'FULL_ALIGNED') != 'NONE'
        if clean_wedge_orientation is not None and clean_wedge_orientation != coordgen_enabled:
            raise KeyError(f'CLEAN_WEDGE_ORIENTATION {remove_msg}; wedge '
                           'positions depend on coordinate generation')
        config.pop('CLEAN_WEDGE_ORIENTATION', None)

        # If coordgen is enabled, we ignore input wedge bonds
        if not config.get('CLEAR_INVALID_WEDGE_BONDS',
                          True) and coordgen_enabled:
            raise ValueError("CLEAR_INVALID_WEDGE_BONDS is always True if "
                             "if coordinate generation is on.")

        explicit_H = config.get("EXPLICIT_HYDROGENS")
        if not coordgen_enabled and explicit_H == 'ADD_ALL':
            raise ValueError("EXPLICIT_HYDROGENS=ADD_ALL is not allowed when "
                             "coordinate generation is off.")
        if not coordgen_enabled and explicit_H == 'REMOVE_ALL':
            raise ValueError("EXPLICIT_HYDROGENS=REMOVE_ALL is not allowed "
                             "when coordinate generation is off, because it "
                             "may lead to a loss of stereochemistry.")

[docs]    @staticmethod
    def fromConfig(config: dict):
        """
        :param config: configuration from which to build options
        :raise KeyError: if an unknown key is present
        :raise ValueError: if an unknown value is present
        """
        PreprocessorOptions._updateConfig(config)

        key_to_enum = {
            "EXPLICIT_HYDROGENS": ExplicitHydrogens,
            "GENERATE_COORDINATES": GenerateCoordinates,
            "CHIRAL_FLAG_0_MEANING": ChiralFlag0Meaning,
        }
        config = copy.deepcopy(config)
        for key, value in config.items():
            if key in key_to_enum:
                try:
                    config[key] = key_to_enum[key][value]
                except KeyError:
                    raise ValueError(f"{key} does not support {value}")

        # Assure all boolean values are set with boolean types
        for key, default in PreprocessorOptions().toConfig().items():
            value = config.get(key, default)
            if isinstance(default, bool) and not isinstance(value, bool):
                raise ValueError(f"{key} must have be a boolean value: {value}")

        try:
            return PreprocessorOptions(**config)
        except TypeError as e:
            raise KeyError(e)

[docs]    def toConfig(self) -> dict:
        config = self._asdict()
        for key, value in config.items():
            if isinstance(value, enum.Enum):
                config[key] = value.name
        return config


[docs]def remove_invalid_config_options(config: dict) -> Tuple[str]:
    """
    :param config: configuration from which to build options, from which
        all invalid keys and values will be stripped
    :return: tuple of errors encountered
    """
    errors = []
    attempts = len(config)
    for i in range(attempts):
        try:
            PreprocessorOptions.fromConfig(config)
        except (KeyError, ValueError) as exc:
            errors.append(str(exc))
            bad_key = str(exc).strip('"\'').split()[0]
            del config[bad_key]
    return tuple(errors)


[docs]@decorator.decorator
def audit_changes(func: Callable, mol: Chem.rdchem.Mol, *args):
    """
    When the global audit_changes_log is initialized, compares mol CXSMILES
    before and after the given function call, capturing information when the
    CXSMILES has been changed.

    :param func: transformation function
    :param mol: molecule to apply transformation to
    """

    if audit_changes_log is None:
        return func(mol, *args)

    ext_smiles_format = rdkit_extensions.Format.EXTENDED_SMILES
    pre_cxsmiles = rdkit_extensions.rdmol_to_text(mol, ext_smiles_format)
    mol = func(mol, *args)
    post_cxsmiles = rdkit_extensions.rdmol_to_text(mol, ext_smiles_format)

    if pre_cxsmiles != post_cxsmiles:
        changes = {}
        changes["timestamp"] = f"{datetime.datetime.now().ctime()}"
        changes["operation"] = f"{func.__name__}{args}"
        changes["structure"] = post_cxsmiles
        audit_changes_log.append(changes)

    return mol


[docs]def getprop(getter: Callable, value: str, default: Any = None) -> Any:
    try:
        return getter(value)
    except KeyError:
        return default


[docs]def is_wildcard(atom):
    """Is atom a wildcard?"""
    return atom.DescribeQuery().strip() == "AtomNull"


[docs]def is_queryatom_exception(atom):
    """
    Normally we raise an exception if query atoms are in the molecule to be preprocessed,
    but we don't want to do that if the atom is an attachment point

    :param atom: the atom to check
    :returns: whether or not the atom is allowed in the preprocessor
    """
    # must be a '*' atom to be an attachment point or in a polymer
    if not is_wildcard(atom):
        return False

    if getprop(atom.GetProp, ATOM_PROP_ATOM_LABEL, '').startswith('_AP'):
        return True

    return False


[docs]def coords_all_zero(conf):
    """

    Returns whether or not all atom positions in a conformer are zero
    """
    ps = conf.GetPositions()
    return np.alltrue(np.abs(ps) < 1e-4)


[docs]@audit_changes
def setup_mol(mol):
    """
    Setup on a molecule that is always done regardless of configuration.

    :param mol: An unsanitized RDKit Mol
    :returns: A partially sanitized RDKit mol, ready for the standardizer.
    """
    # Don't sanitize kekulization until after fixing hydrogens on aromatic nitrogen
    ops = _SANITIZE_OPS ^ Chem.SANITIZE_KEKULIZE
    Chem.SanitizeMol(mol, sanitizeOps=ops)
    mol.UpdatePropertyCache(strict=False)

    # For mols with all zero-coordinates we will prime chiral tags
    # from the atom parities and not recalculate them again.
    mol = assign_zero_coords_chirality(mol)

    assign_stereochemistry(mol)

    return mol


[docs]@audit_changes
def check_kekulization(mol, options):
    if options.CHOOSE_CANONICAL_TAUTOMER or options.RESOLVE_AMBIGUOUS_TAUTOMERS:
        mol = convert.add_hs_to_aromatic_nitrogen(mol)

    # do not attempt to sanitize with kekulization before
    # fixing! throwing a KekulizeException seems to leave
    # the bonds in a bad state (all single bonds)
    Chem.SanitizeMol(mol, sanitizeOps=_SANITIZE_OPS)
    return mol


[docs]def assign_zero_coords_chirality(mol):
    """
    Molecules with all-zero coordinates need to have the "chirality tags"
    primed from the atom parity properties. Once these are in place, we
    remove the conformer, and leave the mol in a state equivalent to one
    that came from a SMILES input.
    """
    try:
        conf = mol.GetConformer()
    except ValueError:
        pass
    else:
        if coords_all_zero(conf):
            Chem.AssignAtomChiralTagsFromMolParity(mol)
            mol.RemoveAllConformers()
    return mol


[docs]def assign_stereochemistry(mol):
    try:
        conf = mol.GetConformer()
    except ValueError:
        pass
    else:
        if not coords_all_zero(conf):
            # If we have all-zero coordinates, we don't care about
            # the 2D/3D flag (which is read, rather than checked)

            if conf.Is3D():
                # If we have 3D coordinates, calculate both chirality
                # and stereo bonds from them.
                Chem.AssignStereochemistryFrom3D(mol)
                return

            else:
                # If we have 2D coordinates available, use them
                # to refresh stereo bond directions
                Chem.DetectBondStereoChemistry(mol, conf)

    # The general case, valid both no conformation and calculate
    # stereochemistry from parity, and stereo bonds from bond directions.
    Chem.AssignStereochemistry(mol)


[docs]@audit_changes
def correct_sgroup_coordinates(mol):
    """
    If coordinates are generated, make sure the FIELDDISP property in the
    Sgroups are using relative coordinates.
    """
    s_groups = Chem.GetMolSubstanceGroups(mol)
    Chem.ClearMolSubstanceGroups(mol)
    for s_group in s_groups:
        properties = s_group.GetPropsAsDict()
        if s_group.HasProp("FIELDDISP"):
            new_fielddisp = re.sub(r'[0-9\- .]*D[AR]([U ])',
                                   r'    0.0       0.0       DR\1',
                                   s_group.GetProp("FIELDDISP"))
            s_group.SetProp("FIELDDISP", new_fielddisp)
        Chem.AddMolSubstanceGroup(mol, s_group)
    return mol


[docs]def preprocess_molblock(molblock: str, config: Optional[dict] = None) -> str:
    """
    Standardizes an MDL molblock

    :param molblock: input molblock
    :param config: dict specifying preprocessor options
    :return: standardized molblock
    """

    mol = convert.sdf_to_rdkit(molblock)
    options = PreprocessorOptions.fromConfig(config) if config else None
    out_mol = preprocess(mol, options)
    return convert_to_molblock(out_mol, options)


[docs]@rdkit_adapter.suppress_rdkit_log()
def preprocess(
        mol: Chem.rdchem.Mol,
        options: Optional[PreprocessorOptions] = None) -> Chem.rdchem.Mol:
    """
    Standardizes an RDKit mol

    :param mol: input mol
    :param options: preprocessor options
    :return: standardized mol
    """

    options = options or PreprocessorOptions()

    assert_not_blinded(mol, max_num_atoms=options.MAX_NUM_ATOMS)

    assert_not_query(mol)

    # Make sure the 3D flag matches the input coordinates before any Is3D calls
    try:
        conf = mol.GetConformer()
    except ValueError:
        pass
    else:
        has_3d_coords = any(xyz[2] != 0.0 for xyz in conf.GetPositions())
        if conf.Is3D() and not has_3d_coords:
            conf.Set3D(False)

    mol = setup_mol(mol)
    mol = check_kekulization(mol, options)

    # grab the initial MDL chiral flag value
    initial_chiral_flag = getprop(mol.GetUnsignedProp, MOL_PROP_CHIRAL_FLAG,
                                  MOL_PROP_OFF)

    # Pop so we can later reapply with update indices from transformations
    initial_stereo_groups = mol.GetStereoGroups()
    substance_groups = Chem.GetMolSubstanceGroups(mol)
    Chem.ClearMolSubstanceGroups(mol)

    if options.REMOVE_PROPERTIES:
        mol = remove_properties(mol)

    # This needs to happen before any transformations that might change any bond/atom indexes.
    original_bond_mapping = tag_mol_indexes(mol)

    # clarification as to why we're removing Hs when ADD_ALL is set:
    # if we are going to be adding Hs later anyway, we might as
    # well remove them now in order to simplify the rest of the process
    #
    # In order to maintain the current Hs, EXPLICIT_HYDROGENS should be
    #   set to "AS_IS"
    if not options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS:
        keep_wedged = options.EXPLICIT_HYDROGENS in (
            ExplicitHydrogens.KEEP_WEDGED,
            ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED)
        mol = remove_explicit_hydrogens(mol,
                                        substance_groups,
                                        keep_wedged=keep_wedged)

    if options.STRIP_SALTS:
        mol = strip_salts(mol, salt_list=options.STRIP_SALTS)

    if options.KEEP_ONLY_LARGEST_STRUCTURE:
        mol = remove_fragments(mol, substance_groups)

    if options.NEUTRALIZE:
        mol = neutralize(mol,
                         options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS)

    # Both transformations and tautomerization can "move" double bonds,
    # so we need to recalculate stereo if they cause any changes.
    # Also, tautomerization might reverse some of the transformations
    # we did, so run that first.
    if options.CHOOSE_CANONICAL_TAUTOMER:
        mol = generate_canonical_tautomer(mol)

    mol = apply_transformations(mol, options.TRANSFORMATIONS)

    # basically - who is doing the depiction - us or the input?
    coordgen_enabled = options.GENERATE_COORDINATES in (
        GenerateCoordinates.FULL, GenerateCoordinates.FULL_ALIGNED)
    # To be removed with LDIDEAS-5327; for now, force coordinate generation if
    # the property indicating this compound was sketched is present
    coordgen_enabled = getprop(mol.GetBoolProp,
                               value=LD_PROP_FORCE_COORDGEN,
                               default=coordgen_enabled)
    mol.ClearProp(LD_PROP_FORCE_COORDGEN)

    # Force coordgen if input is 3D
    try:
        coordgen_enabled = coordgen_enabled or mol.GetConformer().Is3D()
    except ValueError:
        pass

    if not coordgen_enabled:
        rdkit_extensions.reapply_molblock_wedging(mol)

    elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ADD_ALL:
        mol = add_explicit_hydrogens(mol)

    elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED:
        mol = add_chiral_hs(mol)
        mol = add_explicit_hydrogens(mol, only_on_hetero=True)

    elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.KEEP_WEDGED:
        # if we are going to be setting the wedge orientation, we need
        # to add chiral Hs now, before coordinates are generated. Then
        # we set the bond wedging after generating coordinates
        mol = add_chiral_hs(mol)

    # This should happen after all transformations that can alter
    # atom and bond indexes. SGroups are reapplied onto the mol here.
    mol = update_mol_groups(mol, initial_stereo_groups, substance_groups,
                            original_bond_mapping)

    # After the call to update_mol_groups(), substance_groups are no longer
    # in sync with mol, and should not be used agin.
    del substance_groups

    if options.CLEAR_INVALID_WEDGE_BONDS:
        mol = clear_wedge_bonds_from_achiral_centers(mol)

    # Calculating the enhanced stereochemistry should happen
    # after clear_wedge_bonds_from_achiral_centers(), since it does
    # recalculate stereo, and we might find/remove some chiral centers
    mol = calculate_enhanced_stereo(mol, options.CHIRAL_FLAG_0_MEANING,
                                    initial_chiral_flag)

    if options.STRIP_AND_GROUPS_ON_SINGLE_ATOM:
        mol = strip_stereo_and(mol)

    if coordgen_enabled:
        align = options.GENERATE_COORDINATES is GenerateCoordinates.FULL_ALIGNED
        mol = generate_coordinates(mol, align)
        mol = correct_sgroup_coordinates(mol)

        # now that coordinates are set, we need to assign bond wedging
        mol = wedge_clean(mol)

        if options.EXPLICIT_HYDROGENS in (
                ExplicitHydrogens.KEEP_WEDGED,
                ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED):
            # Make sure we remove all non-wedged H again.
            # SGroups have been updated, so reread them.
            sgroups = Chem.GetMolSubstanceGroups(mol)
            keep_hetero = options.EXPLICIT_HYDROGENS == ExplicitHydrogens.ON_HETERO_AND_KEEP_WEDGED
            mol = remove_explicit_hydrogens(mol,
                                            sgroups,
                                            keep_wedged=True,
                                            keep_hetero=keep_hetero)

    mol = remove_wiggly_bonds_around_double_bonds(mol)

    return mol


[docs]class BlindedCompoundError(ValueError):
    pass


[docs]def assert_not_blinded(mol: Chem.rdchem.Mol,
                       max_num_atoms: Optional[int] = None):
    """
    Checks imcoming mol to confirm it has real atoms; if it doesn't it may
    have been intentionally stripped by the caller. LiveDesign marks these
    structures as having been "blinded", meaning a customer may have IP/legal
    restrictions, or there's a delay in registering the structure despite
    having assay data available. Currently, LiveDesign handles these structures
    by keeping a row in the LiveReport, but without an associated SDF or image.
    This is different from other registration errors, which are simply archived.

    :param mol: RDKit Mol to consider
    :param max_mol_wt: maximum allowed molecular weight
    """

    num_atoms = mol.GetNumAtoms()
    if num_atoms == 0:
        raise BlindedCompoundError("No atoms present")

    if all(atom.GetAtomicNum() == 0 for atom in mol.GetAtoms()):
        raise BlindedCompoundError("Only dummy atoms present")

    if max_num_atoms is not None and num_atoms > max_num_atoms:
        raise BlindedCompoundError(
            f"{num_atoms} atoms present; exceeds {max_num_atoms} atom limit")


[docs]def assert_not_query(mol: Chem.rdchem.Mol):
    """
    Checks incoming mol to confirm there are no query features present on atoms
    or bonds, which would otherwise make it not compatible with registration.

    :param mol: RDKit Mol to consider
    """
    if mol.HasProp(MOL_PROP_LINK_NODES):
        raise ValueError("Link node molecule query feature found")
    has_polymer = any(
        convert.is_polymer(sg) for sg in Chem.GetMolSubstanceGroups(mol))
    for bond in mol.GetBonds():
        if bond.HasProp(BOND_PROP_END_PTS):
            raise ValueError("Variable attachment bond found")
        if bond.HasQuery():
            raise ValueError("Bond query feature found")
    for atom in mol.GetAtoms():
        if atom.HasQuery():
            if atom.HasProp(MOL_PROP_R_LABEL):
                raise ValueError("Atom with R Label found")
            if not is_queryatom_exception(atom) and not has_polymer:
                raise ValueError("Atom query feature found")


[docs]def get_atoms_mapping(mol_atoms):
    atom_idx_mapping = {}
    for at in mol_atoms:
        try:
            atom_idx_mapping[at.GetUnsignedProp(
                ATOM_PROP_ORIGINAL_INDEX)] = at.GetIdx()
        except KeyError:
            pass
    return atom_idx_mapping


[docs]def get_bonds_mapping(mol, original_bond_mapping, atom_idx_mapping):
    bond_idx_mapping = {}
    for bnd_idx, bnd_atom_old_idxs in original_bond_mapping.items():
        try:
            bnd_atom_new_idxs = [
                atom_idx_mapping[old_idx] for old_idx in bnd_atom_old_idxs
            ]
        except KeyError:
            # Any of the atoms does not exist
            continue

        new_bnd = mol.GetBondBetweenAtoms(*bnd_atom_new_idxs)
        if new_bnd is None:
            # The atoms exist, but are no longer bonded
            continue

        bond_idx_mapping[bnd_idx] = new_bnd.GetIdx()
    return bond_idx_mapping


[docs]def tag_mol_indexes(mol):
    """
    Tag atoms with the initial indexes on the mol and create a bond mapping.
    Bonds are less stable than atoms, so we create an external mapping to
    the atoms they bind
    """
    for atom in mol.GetAtoms():
        atom.SetUnsignedProp(ATOM_PROP_ORIGINAL_INDEX, atom.GetIdx())

    bond_mapping = {
        bnd.GetIdx(): (bnd.GetBeginAtomIdx(), bnd.GetEndAtomIdx())
        for bnd in mol.GetBonds()
    }

    return bond_mapping


def _map_indexes(mapping, original_idxs):
    original_idxs = set(original_idxs)
    ret = set()
    for idx in original_idxs:
        try:
            new_idx = mapping[idx]
        except KeyError:
            pass
        else:
            ret.add(new_idx)
    return sorted(ret), original_idxs != ret


[docs]def check_attachment_points_changed(sg, atom_idx_mapping):
    for attachpt in sg.GetAttachPoints():
        # if aIdx has been removed, then the SGroup doesn't make sense,
        # and we should drop it
        try:
            new_aIdx = atom_idx_mapping[attachpt.aIdx]
        except KeyError:
            raise ValueError('Attach point invalidated')
        else:
            if new_aIdx != attachpt.aIdx:
                return True

        # lvIdx might be a H atom we made implicit
        try:
            new_lvIdx = atom_idx_mapping[attachpt.lvIdx]
        except KeyError:
            return True
        else:
            if new_lvIdx != attachpt.lvIdx:
                return True
    return False


[docs]def check_cstate_changed(sg, bond_idx_mapping):
    for cstate in sg.GetCStates():
        # Drop the SGroup if the CState bond has been removed.
        try:
            new_bond = bond_idx_mapping[cstate.bondIdx]
        except KeyError:
            raise ValueError('CState invalidated')
        else:
            if new_bond != cstate.bondIdx:
                return True

    return False


[docs]def update_sgroup_indexes(sg, sg_atoms, sg_parent_atoms, sg_bonds,
                          atom_idx_mapping, bond_idx_mapping):

    sg.SetAtoms(sg_atoms)
    sg.SetParentAtoms(sg_parent_atoms)
    sg.SetBonds(sg_bonds)

    attachpts = sg.GetAttachPoints()
    sg.ClearAttachPoints()
    for attachpt in attachpts:
        aIdx = atom_idx_mapping[attachpt.aIdx]

        # lvIdx might be a H atom we made implicit
        try:
            lvIdx = atom_idx_mapping[attachpt.lvIdx]
        except KeyError:
            lvIdx = 0

        sg.AddAttachPoint(aIdx, lvIdx, attachpt.id)

    cstates = sg.GetCStates()
    sg.ClearCStates()
    for cstate in cstates:
        bond_idx = bond_idx_mapping[cstate.bondIdx]
        sg.AddCState(bond_idx, cstate.vector)


[docs]def update_mol_groups(mol, stereo_groups, substance_groups,
                      original_bond_mapping):
    """
    Update atoms and bonds in stereo and substance groups, dropping any
    atoms/groups that are no longer valid for the current state of the mol.
    """

    if not stereo_groups and not substance_groups:
        return mol

    atom_idx_mapping = get_atoms_mapping(mol.GetAtoms())
    bond_idx_mapping = get_bonds_mapping(mol, original_bond_mapping,
                                         atom_idx_mapping)

    if stereo_groups:
        mol = update_stereo_groups(mol, stereo_groups, atom_idx_mapping)

    if substance_groups:
        mol = update_sgroups(mol, substance_groups, atom_idx_mapping,
                             bond_idx_mapping)

    return mol


[docs]@audit_changes
def update_sgroups(mol, substance_groups, atom_idx_mapping, bond_idx_mapping):
    """
    Update SGroups to reflect the transformations done on mol,
    updating with new atom and bond indexes, as well as atoms
    that might have been added or removed.
    """

    for sg in substance_groups:
        sg_atoms, atoms_changed = _map_indexes(atom_idx_mapping, sg.GetAtoms())
        sg_parent_atoms, parent_atoms_changed = _map_indexes(
            atom_idx_mapping, sg.GetParentAtoms())
        sg_bonds, bonds_changed = _map_indexes(bond_idx_mapping, sg.GetBonds())

        try:
            attp_changed = check_attachment_points_changed(sg, atom_idx_mapping)
            cstate_changed = check_cstate_changed(sg, bond_idx_mapping)
        except ValueError:
            continue

        # If any of the lists changed, we need to update the SGroup
        if any((atoms_changed, parent_atoms_changed, bonds_changed,
                attp_changed, cstate_changed)):

            # If we removed everything that the SGroup contained, then drop it.
            if len(sg_atoms) == len(sg_parent_atoms) == len(sg_bonds) == 0:
                continue

            # Add the SGroup to the updated mol so that checks are made against it
            sg = Chem.AddMolSubstanceGroup(mol, sg)
            update_sgroup_indexes(sg, sg_atoms, sg_parent_atoms, sg_bonds,
                                  atom_idx_mapping, bond_idx_mapping)
        else:
            Chem.AddMolSubstanceGroup(mol, sg)

    return mol


[docs]@audit_changes
def update_stereo_groups(mol, stereo_groups, atom_idx_mapping):

    rwmol = Chem.RWMol(mol)
    new_groups = []
    for group in stereo_groups:
        old_indexes = [at.GetIdx() for at in group.GetAtoms()]
        new_indexes, _ = _map_indexes(atom_idx_mapping, old_indexes)
        if new_indexes:
            new_groups.append(
                Chem.CreateStereoGroup(group.GetGroupType(), rwmol,
                                       new_indexes))

    rwmol.SetStereoGroups(new_groups)

    return rwmol.GetMol()


[docs]@audit_changes
def add_explicit_hydrogens(mol, only_on_hetero=False):
    if only_on_hetero:
        hetero_atoms = [
            at.GetIdx() for at in mol.GetAtoms() if at.GetAtomicNum() != 6
        ]
        return rdmolops.AddHs(mol,
                              onlyOnAtoms=hetero_atoms,
                              explicitOnly=False,
                              addCoords=True)
    return rdmolops.AddHs(mol, explicitOnly=False, addCoords=True)


[docs]@audit_changes
def remove_explicit_hydrogens(mol,
                              sgroups,
                              keep_wedged=False,
                              keep_hetero=False):
    # we use this to flag Hs that should not be removed.
    # there's no reasonable scenario in which an H atom
    # in a molecule could have an isotope value of 10, so
    # this should be safe
    save_isotope = 10
    ps = Chem.RemoveHsParameters()
    ps.updateExplicitCount = True
    ps.removeWithWedgedBond = not keep_wedged

    # Always remove double bond stereo Hs, except for imines,
    # which we handle below. In any other case, the double bond
    # stereo direction is either incorrect (e.g. 2 H neighbors),
    # or can be flipped to the other neighbor (in sdf, double bond
    # neighbors cannot be flagged to specifically carry the direction,
    # so we can't tell if a H is meant to be kept)
    ps.removeDefiningBondStereo = True

    saved_hs = False
    if keep_wedged:
        for atom in mol.GetAtoms():
            if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0:
                for bnd in atom.GetBonds():
                    # why this is as complex as it is:
                    # Information about bond wedging is stored as a bond property after the CTAB
                    # is parsed. Because the values have different meanings, the V2000 and V3000
                    # parsers use different property names.
                    # The values here correspond to bonds which are wedged, hashed, or "squiggled"
                    #  above comment valid for at least the 2020.03 RDKit release
                    if getprop(bnd.GetUnsignedProp, BOND_PROP_BOND_STEREO) in (1, 6, 4) or \
                        getprop(bnd.GetUnsignedProp,BOND_PROP_BOND_CONFIG) in (1, 2, 3):
                        atom.SetIsotope(save_isotope)
                        saved_hs = True
                        break

                    if keep_hetero and bnd.GetOtherAtom(
                            atom).GetAtomicNum() != 6:
                        atom.SetIsotope(save_isotope)
                        saved_hs = True
                        break

        # Keep imine stereogenic Hs to preserver stereo
        for imine_match in mol.GetSubstructMatches(IMINE_QUERY):
            if imine_match:
                _, n_atom, h_atom = imine_match
                bnd = mol.GetBondBetweenAtoms(n_atom, h_atom)
                if bnd.GetBondDir() == Chem.BondDir.ENDDOWNRIGHT or \
                        bnd.GetBondDir() == Chem.BondDir.ENDUPRIGHT:
                    mol.GetAtomWithIdx(h_atom).SetIsotope(save_isotope)
                    saved_hs = True

    # hydrogens that are an in an XBOND in a polymer sgroup should not be
    # removed, or else RDKit will remove the entire sgroup
    for sgroup in sgroups:
        if not convert.is_polymer(sgroup):
            continue
        for bnd_idx in sgroup.GetBonds():
            bnd = mol.GetBondWithIdx(bnd_idx)
            for atom in [bnd.GetBeginAtom(), bnd.GetEndAtom()]:
                if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0:
                    atom.SetIsotope(save_isotope)
                    saved_hs = True

    res = rdmolops.RemoveHs(mol, ps, sanitize=False)

    if saved_hs:
        # Isotopes were set on the input mol, and they are copied to the result mol.
        # We want the input to be unchanged, so we remove isotopes from both mols.
        for m in (res, mol):
            for atom in m.GetAtoms():
                if atom.GetAtomicNum() == 1 and atom.GetIsotope(
                ) == save_isotope:
                    atom.SetIsotope(0)

    return res


[docs]def convert_to_molblock(mol, options=None):
    """
    Convert processed mol into a molblock and make necessary updates.
    """
    processed_molblock = convert.rdkit_to_sdf(mol)

    options = options or PreprocessorOptions()
    if options.HEAVY_HYDROGEN_DT:
        processed_molblock = convert_heavy_hydrogens(processed_molblock)

    return processed_molblock


[docs]def convert_heavy_hydrogens(molblock):
    """
    NOTE that this operates on a molblock, not a molecule

    The RDKit does not currently (v2020.03) support writing D or T to
    mol blocks, so we need to post-process the text. Fortunately it's
    an easy regex in v3000 mol blocks. This does not work with V2000
    mol blocks, so we throw a ValueError there. This doesn't seem like
    a big deal since V2000 support is primarly being kept around for
    debugging purposes.
    If we need to eventually support V2000+HEAVY_HYDROGEN_DT, some
    not-completely-trivial code will need to be written.
    """
    d_expr = re.compile(r"^(M  V30 [0-9]+) H (.+?) MASS=2([^\n]*)",
                        flags=re.MULTILINE)
    out_molblock = d_expr.sub(r"\1 D \2\3", molblock)
    t_expr = re.compile(r"^(M  V30 [0-9]+) H (.+?) MASS=3([^\n]*)",
                        flags=re.MULTILINE)
    out_molblock = t_expr.sub(r"\1 T \2\3", out_molblock)
    return out_molblock


[docs]@audit_changes
def neutralize(mol, checkForProblematicHs=False):
    res = uncharger.uncharge(mol)
    res.UpdatePropertyCache(strict=False)

    if checkForProblematicHs:
        # NOTE the logic here really only applies to the organic subset.
        # it's not going to do well with other element types, but hopefully
        # no-one is doing neutralization on species with non-organic atoms
        # and EXPLICIT_HS set to "AS_IS"
        if res.GetNumAtoms() != mol.GetNumAtoms():
            raise RuntimeError("uncharging changed the number of atoms")
        for i in range(res.GetNumAtoms()):
            oAt = mol.GetAtomWithIdx(i)
            oChg = oAt.GetFormalCharge()
            if oChg > 0:
                nAt = res.GetAtomWithIdx(i)
                nChg = nAt.GetFormalCharge()
                if nChg != oChg:
                    # positive charges on atoms with Hs should have had an H removed, but
                    # we can't currently do that
                    if oAt.GetTotalNumHs(
                            includeNeighbors=True) - oAt.GetTotalNumHs():
                        raise ValueError(
                            "could not neutralize positive charged atom with explicit H neighbors"
                        )
    return res


[docs]def unicode_to_str(unicode_str):
    """
    Takes a unicode object and converts it to a str (utf-8).
    If the arg is already a str, returns unicode_str (i.e. if run with python 3).
    Needed to support python 2/3 with unicode_literals.

    py2: type<unicode> -> type<str utf-8>
    py3: type<str utf-8> (no unicode type exists)

    :type unicode_str: unicode (py2) or str (py3)
    :param unicode_str: the unicode that potentially needs converting (i.e. if run with python 2)

    :return: str
    """
    if isinstance(unicode_str, str):
        return unicode_str

    return unicode_str.encode("utf-8")


[docs]@audit_changes
def transform(mol, transformation):
    """
    apply the transformation to the molecule repeatedly until it no longer applies.

    the maxTransformations argument is just there to prevent us from ending up
    in an infinite loop due to a bogus transformation

    Please note that running transformations may alter the stereochemistry of mol,
    so a stereo recalculation from coordinates might be required.
    """
    max_transformations = 1000
    smarts = unicode_to_str(transformation)
    rxn = rdChemReactions.ReactionFromSmarts(smarts)
    for _ in range(max_transformations):
        output = rxn.RunReactants((mol,))
        if output:
            mol = output[0][0]
            mol.UpdatePropertyCache(strict=False)
        else:
            # When we get here, the reaction can no longer be applied
            # on whatever mol is at that moment, so don't try again
            break
    return mol


[docs]def in_xy_plane(mol):
    if mol.GetNumConformers() == 0:
        return False
    return not mol.GetConformer().Is3D()


[docs]@audit_changes
def generate_coordinates(mol, align=False):
    if mol.GetNumAtoms() < 2:
        # we don't really need to deal with coordgen if there's only a single atom
        mol.RemoveAllConformers()
        conf = Chem.Conformer(mol.GetNumAtoms())
        conf.Set3D(False)
        mol.AddConformer(conf)
        return mol

    # keep original mol to align with
    original_mol = Chem.Mol(mol)
    Chem.rdCoordGen.AddCoords(mol)

    # align input mol and mol with new coords
    if align and in_xy_plane(mol) and in_xy_plane(original_mol):
        rdMolAlign.AlignMol(mol, original_mol)

    # determine new coordinates of polymer brackets
    mol = convert.clean_up_polymer_brackets(mol, original_mol)

    return mol


def _find_matches(mol, st):
    """Which atoms match?"""
    # it's possible that atoms were added or deleted. We hope that this is only
    # hydrogens
    rdk_non_hydrogens = sum(1 for a in mol.GetAtoms() if a.GetAtomicNum() != 1)
    schro_non_hydrogens = sum(1 for a in st.atom if a.atomic_number != 1)
    if rdk_non_hydrogens != schro_non_hydrogens:
        msg = (f"Schrodinger structure doesn't match RDKit mol, they have "
               f"different numbers of non-hydrogen atoms (RDKit: "
               f"{rdk_non_hydrogens} != Schrodinger: {schro_non_hydrogens})")
        raise rdkit_adapter.InconsistentStructureError(msg)

    matches = {}
    for a in st.atom:
        try:
            idx = a.property[adapter.RDK_INDEX]
        except KeyError:
            continue
        rdkit_atom = mol.GetAtomWithIdx(idx)
        if rdkit_atom:
            matches[a] = rdkit_atom
    return matches


_BOND_TYPES_S2R = {
    structure.BondType.Single: Chem.BondType.SINGLE,
    structure.BondType.Double: Chem.BondType.DOUBLE,
    structure.BondType.Triple: Chem.BondType.TRIPLE,
    structure.BondType.Zero: Chem.BondType.ZERO,
    structure.BondType.Dative: Chem.BondType.DATIVE
}


def _move_bond_orders(st, editmol, matches):
    """Move bond orders and parity from Schrodinger to RDKit"""
    for bond in st.bond:
        a1, a2 = bond.atom1, bond.atom2
        rdk_a1 = matches.get(a1)
        rdk_a2 = matches.get(a2)
        if not rdk_a1 or not rdk_a2:
            continue
        rdbond = editmol.GetBondBetweenAtoms(rdk_a1.GetIdx(), rdk_a2.GetIdx())
        new_type = _BOND_TYPES_S2R[bond.type]
        if new_type != rdbond.GetBondType():
            rdbond.SetBondType(new_type)
            if new_type == Chem.BondType.SINGLE:
                # clear the bond direction if a double bond became a single bond
                rdbond.SetBondDir(Chem.BondDir.NONE)


[docs]def copy_lewis_structure_and_hydrogens(st, mol):
    """
    Applies bond orders and charges from st to mol. Updates #implicitH to match

    Assumes st includes all hydrogens. Adds implicit and explicit hydrogens to the
    mol, but does not add any graph hydrogens. May remove graph hydrogens.
    """
    editmol = Chem.RWMol(mol)
    matches = _find_matches(editmol, st)

    _move_bond_orders(st, editmol, matches)

    # Adjust charges - assumes that the mmct charge is good.
    for mmct_atom, rdkit_atom in matches.items():
        charge = mmct_atom.formal_charge
        if charge != rdkit_atom.GetFormalCharge():
            rdkit_atom.SetFormalCharge(charge)
    Chem.SanitizeMol(editmol, sanitizeOps=Chem.SANITIZE_ADJUSTHS)

    graph_hydrogens_to_delete = []
    for mmct_atom, rdkit_atom in matches.items():
        mmct_hydrogens = sum(
            (1 for a in mmct_atom.bonded_atoms if a.atomic_number == 1))
        mmct_hydrogens += mm.mmat_get_nhua(mmct_atom.atom_type)

        # hydrogen count is implicit + explicit (shown in smiles) + graph hydrogens
        rdk_hydrogens = rdkit_atom.GetTotalNumHs(includeNeighbors=True)

        # A hydrogen was added or removed on the Schrodinger side
        if rdk_hydrogens != mmct_hydrogens:
            new_explicit_h_count = rdkit_atom.GetNumExplicitHs(
            ) + mmct_hydrogens - rdk_hydrogens
            if new_explicit_h_count >= 0:
                rdkit_atom.SetNumExplicitHs(new_explicit_h_count)
            else:
                # Need to delete graph hydrogens
                hydrogens = [
                    a.GetIdx()
                    for a in rdkit_atom.GetNeighbors()
                    if a.GetAtomicNum() == 1
                ]
                remove_count = rdk_hydrogens - mmct_hydrogens
                assert len(hydrogens) >= remove_count
                for h_index in hydrogens[:remove_count]:
                    graph_hydrogens_to_delete.append(h_index)
    graph_hydrogens_to_delete.sort(reverse=True)
    for atom in graph_hydrogens_to_delete:
        editmol.RemoveAtom(atom)

    Chem.SanitizeMol(editmol, sanitizeOps=_SANITIZE_OPS)
    mol = editmol.GetMol()
    mol.UpdatePropertyCache(strict=False)

    return mol


[docs]@audit_changes
def generate_canonical_tautomer(mol):
    st = rdkit_adapter.from_rdkit(mol,
                                  include_properties=False,
                                  include_stereo=False)
    try:
        st = epik_tautomer_canonicalizer.TautomerCanonicalizer().apply(st)[0]
    except TautomerizerException as e:
        logger.warning(f"failed finding a canonical tautomer: {str(e)}")
        return mol
    return copy_lewis_structure_and_hydrogens(st, mol)


[docs]@audit_changes
def clear_wedge_bonds_from_achiral_centers(mol):
    modified = False
    new_mol = Chem.Mol(mol)
    Chem.AssignStereochemistry(new_mol, cleanIt=True, force=True)

    for orig_at, new_at in zip(mol.GetAtoms(), new_mol.GetAtoms()):
        if orig_at.GetChiralTag() != new_at.GetChiralTag():
            modified = True
            break
    if not modified:
        for orig_bnd, new_bnd in zip(mol.GetBonds(), new_mol.GetBonds()):
            if (orig_bnd.GetStereo() != new_bnd.GetStereo() or
                    orig_bnd.GetBondDir() != new_bnd.GetBondDir()):
                modified = True
                break
    return new_mol


[docs]@audit_changes
def calculate_enhanced_stereo(mol, enh_stereo_default_grouping,
                              initial_chiral_flag):
    # Place any chiral atom not in an enhanced stereo group into an enhanced
    # stereo group with type determined by ChiralFlag0Meaning
    enhanced_stereo_atoms = [
        at.GetIdx() for sg in mol.GetStereoGroups() for at in sg.GetAtoms()
    ]
    chiral_atoms = [
        at.GetIdx()
        for at in mol.GetAtoms()
        if at.GetChiralTag() in (Chem.ChiralType.CHI_TETRAHEDRAL_CCW,
                                 Chem.ChiralType.CHI_TETRAHEDRAL_CW) and
        not (at.GetIdx() in enhanced_stereo_atoms)
    ]

    if len(chiral_atoms) != 0:
        rwmol = Chem.RWMol(mol)
        stereo_groups = [sg for sg in rwmol.GetStereoGroups()]

        if initial_chiral_flag == MOL_PROP_ON or enh_stereo_default_grouping == ChiralFlag0Meaning.UNGROUPED_ARE_ABSOLUTE:
            for idx, sg in enumerate(stereo_groups):
                if sg.GetGroupType() == Chem.StereoGroupType.STEREO_ABSOLUTE:
                    # chiral atoms should be added to existing enhanced stereo group
                    chiral_atoms.extend([at.GetIdx() for at in sg.GetAtoms()])
                    stereo_groups.pop(idx)
                    break
            stereo_groups.append(
                Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_ABSOLUTE,
                                       rwmol, chiral_atoms))
        elif enh_stereo_default_grouping == ChiralFlag0Meaning.UNGROUPED_ARE_RACEMIC:
            stereo_groups.append(
                Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_AND, rwmol,
                                       chiral_atoms))
        else:
            stereo_groups.append(
                Chem.CreateStereoGroup(Chem.StereoGroupType.STEREO_OR, rwmol,
                                       chiral_atoms))

        rwmol.SetStereoGroups(stereo_groups)
        mol = rwmol.GetMol()

    # SHARED-8608: Write chiral flag as 0 in the preprocessor unless all
    # enhanced stereo groups are abs
    sg_is_abs = [
        sg.GetGroupType() == Chem.StereoGroupType.STEREO_ABSOLUTE
        for sg in mol.GetStereoGroups()
    ]
    if len(sg_is_abs) > 0 and all(sg_is_abs):
        mol.SetUnsignedProp(MOL_PROP_CHIRAL_FLAG, MOL_PROP_ON)
    else:
        mol.ClearProp(MOL_PROP_CHIRAL_FLAG)

    return mol


[docs]@audit_changes
def strip_stereo_and(input_mol):
    """
    Removes any Stereo AND groups with only one center and flattens the bonds around it

    :param input_mol: The original molecule to consider
    :return: post-processed molecule, if the input molecule was modified
    """
    mol = Chem.RWMol(input_mol)
    keep = []

    for stereo_group in mol.GetStereoGroups():
        if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_AND:
            keep.append(stereo_group)
        elif len(stereo_group.GetAtoms()) > 1:
            keep.append(stereo_group)
        else:
            # flatten the atom, At this point the stereo_group only has one atom
            atom = stereo_group.GetAtoms()[0]
            atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED)
            # make sure that all bond wedging information has been removed around that atom too:
            for bond in atom.GetBonds():
                if bond.GetBeginAtomIdx() == atom.GetIdx():
                    bond.ClearProp(BOND_PROP_BOND_STEREO
                                  )  # what we'd get from a V2000 mol file
                    bond.ClearProp(BOND_PROP_BOND_CONFIG
                                  )  # what we'd get from a V3000 mol file

    mol.SetStereoGroups(keep)
    return Chem.Mol(mol)


[docs]def frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles,
                    largest_smiles):
    """
    A fragment is considered larger if its atoms/weight are larger, the length of the smiles string
    is larger, or the smiles string is lexicographically *smaller* if they are equal length.
    ie, 'AAA' is larger than 'AAB'.. hence the final smiles > largest_smiles check here to reject
    """
    if atoms < largest_atoms:
        return True
    if atoms > largest_atoms:
        return False
    if weight < largest_weight:
        return True
    if weight > largest_weight:
        return False
    if len(smiles) < len(largest_smiles):
        return True
    if len(smiles) > len(largest_smiles):
        return False
    return smiles > largest_smiles


[docs]def connect_variable_attachment_points(mol):
    """
    forms zero-order bonds between one of the atoms of a bond with an ATTACH
    property to the "main" molecule in order to have the molecule+variable
    attachment point treated as a single fragment

    returns a 2-tuple with:
      1. the modified molecule
      2. whether or not the molecule was modified
    """
    bonds_added = False

    res = None
    for bond in list(mol.GetBonds()):
        if bond.HasProp("_MolFileBondAttach") and bond.HasProp(
                "_MolFileBondEndPts"):
            # these should look like: ENDPTS=(3 4 6 5)
            endp_txt = bond.GetProp("_MolFileBondEndPts")
            # do a bit of validation that the format is correct:
            if endp_txt[0] != "(" or endp_txt[-1] != ")":
                logger.warning("invalid ENDPTS format ignored")
                continue
            endps = [int(x) for x in endp_txt[1:-1].split(" ")]
            # do a bit of validation that the format is correct:
            if len(endps) != endps[0] + 1 or endps[0] == 1:
                logger.warning("invalid ENDPTS format ignored")
                continue
            batom_idx = bond.GetBeginAtomIdx()
            oatom_idx = endps[1] - 1
            if res is None:
                res = Chem.RWMol(mol)
            if res.GetBondBetweenAtoms(batom_idx, oatom_idx):
                logger.warning("variable attachment point already connected")
                continue
            res.AddBond(batom_idx, oatom_idx, Chem.BondType.ZERO)
            # set a property on the bond so that we can find it again:
            res.GetBondBetweenAtoms(batom_idx, oatom_idx).SetIntProp(
                "variable_attach_bond", 1)
            bonds_added = True
    if res is None:
        res = mol
    return res, bonds_added


[docs]@audit_changes
def remove_fragments(mol, substance_groups):
    """
    Fragments are not removed if the molecule contains any SGroups which are associated with
    polymers

    Use the following criteria to remove unwanted fragments from *mol*:
     1. keep only the fragment which has the most number of atoms
     2. break ties by keeping only fragments with the greatest molecular weight
     3. break ties with the longest smiles string
     4. break additional ties by keeping the fragment with the earliest alpha sorted SMILES string

    If two or more identical fragments remain after 1-4, we will throw a fatal error.
    """
    for sg in substance_groups:
        if getprop(sg.GetProp, "TYPE") in (
                "MUL",
                "SRU",
                "MON",
                "COP",
                "CRO",
                "GRA",
                "MIX",
                "FOR",
                "MOD",
        ):
            logger.debug(
                "Molecule has polymeric SGroups, skipping remove_fragments.")
            return mol

    nmol, bonds_added = connect_variable_attachment_points(mol)
    frags = Chem.GetMolFrags(nmol, asMols=True, sanitizeFrags=False)
    if len(frags) < 2:
        return mol

    largest_frag = None
    largest_atoms = math.inf
    largest_weight = math.inf
    largest_smiles = None
    duplicates = 0

    for frag in frags:
        [atoms, weight, smiles] = [
            frag.GetNumHeavyAtoms(),
            MolWt(frag),
            Chem.MolToSmiles(frag),
        ]
        if largest_frag is not None:
            if frag_is_smaller(atoms, largest_atoms, weight, largest_weight,
                               smiles, largest_smiles):
                continue

            # If all properties are equivalent, we have a *potential* duplicate of the largest known fragment
            # but if they are not, we've discovered a new, bigger fragment, and may discard any known dupes
            if (atoms == largest_atoms and weight == largest_weight and
                    smiles == largest_smiles):
                duplicates += 1
            else:
                duplicates = 0
            largest_frag = frag
            largest_atoms = atoms
            largest_weight = weight
            largest_smiles = smiles
        else:
            largest_frag = frag
            largest_atoms = atoms
            largest_weight = weight
            largest_smiles = smiles

    if duplicates:
        message = (
            "At the end of REMOVE_FRAGMENTS, we had {duplicates} identical "
            "fragments left over, returning only one.")
        logger.debug(message.format(duplicates=duplicates))

    if bonds_added:
        largest_frag = Chem.RWMol(largest_frag)
        bnds = list(largest_frag.GetBonds())
        for bnd in bnds:
            if bnd.HasProp("variable_attach_bond"):
                largest_frag.RemoveBond(bnd.GetBeginAtomIdx(),
                                        bnd.GetEndAtomIdx())

        largest_frag = largest_frag.GetMol()

    return largest_frag


[docs]@audit_changes
def remove_properties(mol):
    for prop_name in mol.GetPropNames():
        mol.ClearProp(prop_name)
    return mol


[docs]@audit_changes
def strip_salts(mol, salt_list):
    salt_list_str = "\n".join(salt_list)
    remover = SaltRemover.SaltRemover(defnData=salt_list_str)
    res = remover.StripMol(mol, sanitize=False)
    if res.GetNumAtoms() == 0:
        raise ValueError("Removing salts produced an empty structure")
    return res


[docs]def apply_transformations(mol, transformations):
    """
    Apply the given list of transformations, and recalculate stereo
    if at least one transformation applies.
    """

    for tform in transformations:
        mol = transform(mol, tform)

    # Remove properties leftover after RunReactants transformations
    for atom in mol.GetAtoms():
        for prop in (ATOM_PROP_MAP_NUMBER, "old_mapno", "react_atom_idx"):
            if atom.HasProp(prop):
                atom.ClearProp(prop)

    Chem.SanitizeMol(mol, sanitizeOps=_SANITIZE_OPS)
    # If any reactions applied, we need to rescan stereo from coordinates
    # because bond orders & stereo bonds might have changed
    assign_stereochemistry(mol)

    return mol


[docs]@audit_changes
def add_chiral_hs(mol):
    # need to filter chiral_atoms to get only atoms with endocyclic stereobonds
    # since we don't want to add hydrogens to chiral atoms that already have
    # exocyclic stereobonds
    chiral_atoms = [center[0] for center in Chem.FindMolChiralCenters(mol)]
    chiral_atoms_needing_hydrogens = []
    for atom_idx in chiral_atoms:
        atom = mol.GetAtomWithIdx(atom_idx)
        # only mark this for adding hydrogens if there are no exocyclic bonds
        add_atom = all(bond.IsInRing() for bond in atom.GetBonds())
        if add_atom:
            chiral_atoms_needing_hydrogens.append(atom_idx)

    # return mol if there are no chiral atoms to be adjusted
    if not chiral_atoms_needing_hydrogens:
        return mol

    return Chem.AddHs(
        mol,
        explicitOnly=False,
        addCoords=True,
        onlyOnAtoms=chiral_atoms_needing_hydrogens,
    )


[docs]@audit_changes
def wedge_clean(mol):
    if not mol.GetNumConformers():
        return mol

    # resetting the BondDir and UnknownStereo setting on each bond
    # so WedgeMolBonds can do its magic
    for bond in mol.GetBonds():
        if bond.GetBondType() == Chem.BondType.SINGLE and bond.IsInRing():
            if bond.GetBondDir() == Chem.BondDir.UNKNOWN:
                bond.SetIntProp(MOL_PROP_UNKNOWN_STEREO, MOL_PROP_ON)
            bond.SetBondDir(Chem.BondDir.NONE)
        # bonds which were explicitly marked as unknown:
        if getprop(bond.GetUnsignedProp, BOND_PROP_BOND_STEREO) == 4 or \
            getprop(bond.GetUnsignedProp, BOND_PROP_BOND_CONFIG) == 2:
            # ensure that the bond doesn't start at an atom with an specified double
            # bond
            disqualify = False
            for obond in bond.GetBeginAtom().GetBonds():
                if (obond.GetBondType() == Chem.BondType.DOUBLE and
                        obond.GetStereo() != Chem.BondStereo.STEREONONE):
                    disqualify = True
                    break
            if not disqualify:
                bond.SetBondDir(Chem.BondDir.UNKNOWN)

    Chem.WedgeMolBonds(mol, mol.GetConformer())
    return mol


[docs]@audit_changes
def remove_wiggly_bonds_around_double_bonds(mol):
    # We just need to make sure that there any wiggly bonds starting at atoms
    # involved in double bonds don't end up in the output
    for bond in mol.GetBonds():
        if bond.GetBondType() == Chem.BondType.SINGLE and (
                getprop(bond.GetUnsignedProp, BOND_PROP_BOND_STEREO) == 4 or
                getprop(bond.GetUnsignedProp, BOND_PROP_BOND_CONFIG) == 2):
            # it's a wiggly bond. Check to see if it starts at an atom with a double bond
            startsAtDouble = False
            for obond in bond.GetBeginAtom().GetBonds():
                if (obond.GetBondType() == Chem.BondType.DOUBLE and
                        obond.GetStereo() != Chem.BondStereo.STEREONONE):
                    startsAtDouble = True
                    break
            if startsAtDouble:
                if bond.HasProp(BOND_PROP_BOND_STEREO):
                    bond.ClearProp(BOND_PROP_BOND_STEREO)
                if bond.HasProp(BOND_PROP_BOND_CONFIG):
                    bond.ClearProp(BOND_PROP_BOND_CONFIG)
                bond.SetBondDir(Chem.BondDir.NONE)
    return mol