Source code for schrodinger.application.scaffold_enumeration.markush

from collections import defaultdict
import re

from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdqueries

from . import common

_WAS_DUMMY = 'was_dummy'  # RDKit native
_RLABEL_PROP = '_MolFileRLabel'  # RDKit native
_ATOM_LABEL_PROP = 'atomLabel'  # RDKit native
_REACT_ATOM_IDX_PROP = 'react_atom_idx'  # RDKit native
_ACTIVE_PLACEHOLDER_PROP = '_activePlaceHolder'
_BOND_ID_PROP = '_bond_id'


def _get_rlabel(atom):
    """
    Returns atom's R-label or zero.

    :param atom: Atom.
    :type atom: rdkit.Chem.Atom

    :return: R-label (positive integer) or zero.
    :rtype: int
    """

    try:
        return atom.GetUnsignedProp(_RLABEL_PROP)
    except KeyError:
        return 0


def _set_rlabel(atom, rlabel):
    """
    Sets atom's R-label if `rlabel` is positive, clears it otherwise.

    :param atom: Atom.
    :type atom: rdkit.Chem.Atom

    :param rlabel: R-label.
    :type rlabel: int
    """

    if rlabel > 0:
        atom.SetUnsignedProp(_RLABEL_PROP, rlabel)
    else:
        atom.ClearProp(_RLABEL_PROP)


def _clear_attachment_order(atom):
    """
    Clears attachment order atom properties.

    :param atom: Atom.
    :type atom: rdkit.Chem.Atom
    """

    atom.ClearProp(common.CML_ATTACHMENT_ORDER_PROP)
    try:
        label = atom.GetProp(_ATOM_LABEL_PROP)
        if label.startswith('_AP'):
            atom.ClearProp(_ATOM_LABEL_PROP)
    except KeyError:
        pass


def _get_attachment_order(atom):
    """
    Obtains attachment order from `atom` properties.

    :param atom: Atom.
    :type atom: rdkit.Chem.Atom

    :return: Attachment order or None.
    :rtype: Optional[int]
    """

    try:
        return atom.GetIntProp(common.CML_ATTACHMENT_ORDER_PROP)
    except KeyError:
        pass

    try:
        label = atom.GetProp(_ATOM_LABEL_PROP)
        if label.startswith('_AP'):
            return int(label[3:])
    except (KeyError, ValueError):
        pass

    return None


def _is_attachment_point(atom):
    """
    :param atom: Atom.
    :type atom: rdkit.Chem.Atom
    """

    is_dummy = atom.GetAtomicNum() == 0
    is_degree_one = atom.GetDegree() == 1

    return is_dummy and is_degree_one and _get_rlabel(atom) == 0


def _get_attachment_points(mol):
    """
    Gathers "attachment point" atoms in `mol`. If all
    "attachment orders" are available, sort atoms accordingly.

    :param mol: Molecule.
    :type mol: rdkit.Chem.ROMol

    :return: List of "attachment point" atoms and
        "attachment orders" availability.
    :rtype: tuple[list[rdkit.Chem.Atom], bool]
    """

    attachment_points = []

    for atom in mol.GetAtoms():
        if _is_attachment_point(atom):
            attachment_points.append((_get_attachment_order(atom), atom))

    have_attachment_orders = all(o is not None for (o, _) in attachment_points)

    if have_attachment_orders:
        attachment_points.sort(key=lambda x: x[0])

    return [atom for (_, atom) in attachment_points], have_attachment_orders


def _propagate_properties(product, ratom, rgroup):
    """
    Helper function called from `_place_rgroup`.

    :param product: Product molecule obtained via RDKit "chemical reaction".
    :type product: rdkit.Chem.Mol

    :param ratom: "placeholder" (R-group) atom from the scaffold molecule.
    :type ratom: rdkit.Chem.Atom

    :param rgroup: R-group molecule.
    :type rgroup: rdkit.Chem.Mol
    """

    scaffold = ratom.GetOwningMol()

    def scaffold_atom(atom):
        try:
            react_atom_idx = atom.GetProp(_REACT_ATOM_IDX_PROP)
            return scaffold.GetAtomWithIdx(int(react_atom_idx))
        except (KeyError, ValueError):
            return None

    for atom in product.GetAtoms():
        # prefer R-group references from `scaffold`
        # over the ones from the `rgroup`
        sc_atom = scaffold_atom(atom)
        if sc_atom and atom.GetAtomicNum() == 0:
            _set_rlabel(atom, _get_rlabel(sc_atom))

        # drop "attachment orders" inherited from from `rgroup`
        if not sc_atom or atom.HasProp(_WAS_DUMMY):
            _clear_attachment_order(atom)
        atom.ClearProp(_WAS_DUMMY)

        if not sc_atom:
            continue

        # bond IDs and types
        for bond in atom.GetBonds():
            neigh = bond.GetOtherAtom(atom)
            sc_neigh = scaffold_atom(neigh)
            if sc_neigh and neigh.GetIdx() > atom.GetIdx():
                # unaffected scaffold bond
                sc_bond = scaffold.GetBondBetweenAtoms(sc_atom.GetIdx(),
                                                       sc_neigh.GetIdx())
                try:
                    bond.SetProp(common.CML_ID_PROP,
                                 sc_bond.GetProp(common.CML_ID_PROP))
                except KeyError:
                    pass
            elif not sc_neigh:
                # bond to R-group in the scaffold
                try:
                    bond.SetProp(common.CML_ID_PROP,
                                 atom.GetProp(_BOND_ID_PROP))
                    atom.ClearProp(_BOND_ID_PROP)
                except KeyError:
                    pass
                sc_bond = scaffold.GetBondBetweenAtoms(ratom.GetIdx(),
                                                       sc_atom.GetIdx())
                bond.SetBondType(sc_bond.GetBondType())


def _transfer_stereo_groups(src, dest):
    """
    Add the stereo groups from `src` to `dest`. The atom indices in `src` MUST
    also apply to `dest`; this works for example when `dest` was built by
    appending atoms to a copy of `src`, as is the case in _place_rgroup(). This
    function is a workaround because the RDKit reaction machinery presently does
    not copy stereo groups from the product template to the generated product.

    :param Chem.Mol src: source molecule
    :param Chem.Mol dest: destination molecule (not modified)
    :return: modified copy of `dest`
    :rtype: Chem.Mol
    """
    new_mol = Chem.RWMol(dest)
    new_groups = list(new_mol.GetStereoGroups())
    for group in src.GetStereoGroups():
        new_group = Chem.CreateStereoGroup(
            group.GetGroupType(), new_mol,
            [atom.GetIdx() for atom in group.GetAtoms()])
        new_groups.append(new_group)
    new_mol.SetStereoGroups(new_groups)
    return new_mol.GetMol()


def _place_rgroup(atom, rgroup):
    """
    Replaces `atom` with `rgroup`.

    :param atom: Atom.
    :type atom: rdkit.Chem.Atom

    :param rgroup: R-group molecule.
    :type rgroup: rdkit.Chem.Mol

    :return: List of molecules (which may be empty).
    :rtype: list(rdkit.Chem.Mol)
    """

    rgroup_copy = Chem.Mol(rgroup)
    for rgatom in rgroup_copy.GetAtoms():
        rgatom.SetAtomMapNum(0)
        rgatom.ClearProp(_WAS_DUMMY)
        rgatom.ClearProp(_REACT_ATOM_IDX_PROP)

    attachment_points, have_attachment_orders = \
        _get_attachment_points(rgroup_copy)

    if len(attachment_points) != atom.GetDegree():
        return []

    bonds = []
    for b in atom.GetBonds():
        try:
            bond_id = b.GetProp(common.CML_ID_PROP)
            bonds.append((bond_id, b))
        except KeyError:
            break
    else:
        # Sorting bonds by bond ID. If all bond IDs can be converted to an
        # integer, then they will be numerically sorted. As a fallback, a
        # string sort is performed. Note that MdlFileReader gives bond IDs with
        # a 'b' prepended to an integer; these will be converted to ints.
        sort_order = {}
        for id_str, _ in bonds:
            if m := re.match(r'^b?(\d+)$', id_str):
                sort_order[id_str] = int(m.group(1))
            else:
                # Bond ID not convertible to int. Fall back to string sort.
                bonds.sort(key=lambda x: x[0])
                break
        else:
            bonds.sort(key=lambda x: sort_order[x[0]])

    have_bond_ids = len(bonds) == atom.GetDegree()

    # create "reactant" query that matches the `atom`; require
    # "bond priorities" to match "attachment orders" in case both
    # are available, and bond orders are consistent; match only
    # bond orders (from `rgroup`) otherwise

    match_ids = have_bond_ids and have_attachment_orders

    if match_ids:
        # plexus-like matching that relies on labels;
        # check whether bond orders are consistent
        for (ap, (_, b)) in zip(attachment_points, bonds):
            # `ap` has exactly one bond
            if b.GetBondType() != ap.GetBonds()[0].GetBondType():
                match_ids = False
                break

    # "reactant" template
    reactant = Chem.RWMol()

    # R-group (placeholder) atom
    r_atom = rdqueries.HasIntPropWithValueQueryAtom(_ACTIVE_PLACEHOLDER_PROP,
                                                    atom.GetIdx())
    r_atom_index = reactant.AddAtom(r_atom)

    # R-group (placeholder) atom bonds
    for i, ap in enumerate(attachment_points, 1):
        ap.ClearProp(_BOND_ID_PROP)
        ap.SetAtomMapNum(i)
        r_neighbor = Chem.AtomFromSmarts('[*]')
        r_neighbor.SetAtomMapNum(i)
        r_neighbor_index = reactant.AddAtom(r_neighbor)
        if match_ids:
            # Plexus-like, match "bond_id"
            bond_id = bonds[i - 1][0]
            bond_query = rdqueries.HasStringPropWithValueQueryBond(
                common.CML_ID_PROP, bond_id)
            r_num_bonds = reactant.AddBond(r_atom_index, r_neighbor_index)
            reactant.ReplaceBond(r_num_bonds - 1, bond_query)
            # Store "bond_id" as "dummy" atom property
            # because RDKit reactions do not propagate bond
            # properties, but do copy over atom properties (at
            # least to some extent)
            ap.SetProp(_BOND_ID_PROP, bond_id)
        else:
            # match attachment point bond order
            reactant.AddBond(r_atom_index, r_neighbor_index,
                             ap.GetBonds()[0].GetBondType())

    # mark the `atom` temporarily
    atom.SetIntProp(_ACTIVE_PLACEHOLDER_PROP, atom.GetIdx())

    try:
        if not atom.GetOwningMol().HasSubstructMatch(reactant):
            # ENUM-290: favor scaffold bond orders
            for bond in reactant.GetBonds():
                bond.SetBondType(Chem.BondType.UNSPECIFIED)

        rxn = rdChemReactions.ChemicalReaction()
        rxn.AddReactantTemplate(reactant)
        rxn.AddProductTemplate(rgroup_copy)

        rdChemReactions.UpdateProductsStereochemistry(rxn)

        products = [m for (m,) in rxn.RunReactant(atom.GetOwningMol(), 0)]
    finally:
        atom.ClearProp(_ACTIVE_PLACEHOLDER_PROP)

    products = [_transfer_stereo_groups(rgroup, p) for p in products]

    for p in products:
        _propagate_properties(p, atom, rgroup_copy)

    return products


def _get_r2p_map(mol):
    """
    Assumes that `mol` is a product of "chemical reaction". Returns
    a dictionary that maps "reactant" atom indices onto atom indices
    in the `mol`.

    :param mol: Molecule.
    :type mol: rdkit.Chem.Mol

    :return: Reactant-to-product atom index map.
    :rtype: dict(int, int)
    """

    outcome = dict()
    for atom in mol.GetAtoms():
        try:
            react_atom_idx = atom.GetProp(_REACT_ATOM_IDX_PROP)
            outcome[int(react_atom_idx)] = atom.GetIdx()
        except (KeyError, ValueError):
            pass

    return outcome


[docs]def place_rgroups(mol, atom_indices_and_rgroups, appended_rgroups=None): """ Generator that yields realizations of `mol` with (some) atoms replaced by R-groups. :param mol: Molecule. :type mol: rdkit.Chem.Mol :param atom_indices_and_rgroups: List of atom indices paired with corresponding R-groups. :type atom_indices_and_rgroups: list(int, rdkit.Chem.Mol) :param appended_rgroups: List of appended rgroups so far. :rtype appended_rgroups: list(rdkit.Chem.Mol) """ if appended_rgroups is None: appended_rgroups = [] mol_name = mol.GetProp('_Name') if mol.HasProp('_Name') else 'scaffold' if atom_indices_and_rgroups: curr = atom_indices_and_rgroups[0] atom, rgroup = mol.GetAtomWithIdx(curr[0]), curr[1] for p in _place_rgroup(atom, rgroup): # propagate name from mol to products returned by _place_rgroup p.SetProp('_Name', mol_name) mol2p = _get_r2p_map(p) todo = [(mol2p[i], rg) for (i, rg) in atom_indices_and_rgroups[1:]] # track added rgroup(s) appended_rgroups.append(rgroup) yield from place_rgroups(p, todo, appended_rgroups) # remove added rgroup(s) appended_rgroups.pop() else: for index, rgroup in enumerate(appended_rgroups, 1): cxsmiles = Chem.MolToCXSmiles(rgroup) mol.SetProp(f's_rge_R{index}_cxsmiles', cxsmiles) rgroup_id = rgroup.GetProp('_Name') if rgroup.HasProp( '_Name') else '' mol.SetProp(f's_rge_R{index}', rgroup_id) mol_name += f'_{rgroup_id}' # naming scheme: {scaffold_name}_{rgroup1}_{rgroup2} etc mol.SetProp('_Name', mol_name) yield mol
[docs]def canonicalize_R_labels(mol): """ Translates different conventions of R-group labelling into the RDKit "native" (AtomRLabel). :param mol: Molecule. :type mol: rdkit.Chem.Mol """ for atom in mol.GetAtoms(): if _get_rlabel(atom) > 0: continue try: _set_rlabel(atom, int(atom.GetProp(common.CML_RGROUP_REF_PROP))) continue except (KeyError, ValueError): pass try: label = atom.GetProp('atomLabel') if label.startswith('_R'): _set_rlabel(atom, int(label[2:])) except (KeyError, ValueError): pass
[docs]def get_rlabels_set(mol): """ Returns set of R-labels carried by the atoms in the `mol`. :param mol: Molecule. :type mol: rdkit.Chem.Mol :return: Set or R-labels. :rtype: set(int) """ labels = [_get_rlabel(atom) for atom in mol.GetAtoms()] return {r for r in labels if r > 0}
[docs]def get_rlabels_map(mol): """ Returns map from R-labels to atom indices. :param mol: Molecule. :type mol: rdkit.Chem.Mol :return: Map from R-labels to atom indices. :rtype: dict of int:list(int) """ outcome = defaultdict(list) for atom in mol.GetAtoms(): label = _get_rlabel(atom) if label > 0: outcome[label].append(atom.GetIdx()) return outcome