# Source code for schrodinger.application.scaffold_enumeration.mdl

import contextlib
import itertools
import re

from rdkit import Chem

from . import atomlist
from . import common
from . import cxsmiles
from . import markush


def _translate_mdl_atomlist_atoms(mol):
    '''
    Swaps every MDL "atom list" query atom of `mol` for a plain
    (non-query) atom that carries the list of listed elements.

    A query atom counts as an "atom list" when its SMARTS starts with a
    bracketed, comma-separated sequence of atomic numbers (for example
    ``[#6,#7]``). Its replacement is created from the first listed atomic
    number and is handed the symbols of all listed elements through
    `atomlist.set_atom_elements`.

    :param mol: Molecule to be modified in place.
    :type mol: `rdkit.Chem.rdchem.RWMol`
    '''

    periodic_table = Chem.GetPeriodicTable()

    pending = []
    for candidate in mol.GetAtoms():
        if not candidate.HasQuery():
            continue

        pattern = candidate.GetSmarts()
        if re.match(r'\[(#\d+,)*#\d+\]', pattern) is None:
            continue

        atomic_numbers = []
        for found in re.finditer(r'#(\d+)[,\]]', pattern):
            atomic_numbers.append(int(found.group(1)))

        plain = Chem.Atom(atomic_numbers[0])
        symbols = map(periodic_table.GetElementSymbol, atomic_numbers)
        atomlist.set_atom_elements(plain, symbols)
        pending.append((candidate.GetIdx(), plain))

    # the swaps are deferred until the walk over the atoms is finished
    for position, plain in pending:
        mol.ReplaceAtom(position, plain)


def _collect_posvar_bonds(mol):
    '''
    Collect "multi-center groups" (for the "position variation bonds")
    assuming that `mol` originated from MDL format. Replace groups "center"
    query atoms with regular atoms.

    :param mol: Molecule to be processed.
    :type mol: `rdkit.Chem.rdchem.RWMol`

    :return: List of named tuples that describe the "multi-center groups"
        recognized within `mol`.
    :rtype: list(.cxsmiles.MCG)
    '''

    outcome = []

    # position-variant bonds
    for bond in mol.GetBonds():
        try:
            text = bond.GetProp('_MolFileBondEndPts')
        except KeyError:
            continue
        if not re.match(r'\((\d+\s+)+\d+\)', text):
            continue
        # identify the "center" (dummy) atom
        for atom in (bond.GetBeginAtom(), bond.GetEndAtom()):
            if atom.HasQuery() and atom.GetSmarts() == '*':
                dummy = atom
                break
        else:
            continue

        atoms = [int(w) - 1 for w in text[1:-1].split()]
        outcome.append(cxsmiles.MCG(center=dummy.GetIdx(), atoms=atoms[1:]))
        mol.ReplaceAtom(dummy.GetIdx(), Chem.Atom(0))

    return outcome


def _collect_repeating_units(mol):
    '''
    Gathers the "repeating units" described by the "substance groups"
    attached to `mol`.

    Only groups whose `TYPE` property equals ``SRU`` and that also carry
    the `LABEL` and `CONNECT` properties are reported; the rest are
    ignored. Side effect: all substance groups are cleared from `mol`.

    :param mol: Molecule to be inspected.
    :type mol: `rdkit.Chem.rdchem.RWMol`

    :return: List of named tuples, one per "repeating unit".
    :rtype: list(.cxsmiles.SRU)
    '''

    units = []

    for sgroup in Chem.GetMolSubstanceGroups(mol):
        try:
            if sgroup.GetProp('TYPE') != 'SRU':
                continue
            label = sgroup.GetProp('LABEL')
            connect = sgroup.GetProp('CONNECT').lower()
        except KeyError:
            # a group lacking any of the three properties is skipped
            continue
        else:
            unit = cxsmiles.SRU(atoms=list(sgroup.GetAtoms()),
                                subscript=label,
                                superscript=connect)
            units.append(unit)

    Chem.ClearMolSubstanceGroups(mol)

    return units


def translate_mdl_enumerable_features(mol, prop_prefix=common.CML_PROP_PREFIX):
    '''
    Translates metadata that pertains to the "enumerable features"
    from the conventions assumed by the RDKit SDMolSupplier to the
    form expected by this package.

    The input molecule is left untouched; all changes are applied to
    a copy of it.

    :param mol: Molecule with enumerable features in MDL "language".
    :type mol: `rdkit.Chem.rdchem.ROMol`

    :param prop_prefix: Prefix for the names of the properties set by
        this function (the atom/bond `id` and the molecule `sgroups`).
    :type prop_prefix: str

    :return: Adapted molecule.
    :rtype: `rdkit.Chem.rdchem.Mol`
    '''

    out = Chem.RWMol(mol)

    # R-labels
    markush.canonicalize_R_labels(out)
    rlabels = markush.get_rlabels_map(out)

    # convert R-group placeholders into the regular (non-query) atoms
    for idx in itertools.chain.from_iterable(rlabels.values()):
        atom = Chem.Atom(0)
        out.ReplaceAtom(idx, atom, preserveProps=True)

    # "atom lists", "position variation bonds" and "repeating units"
    # https://docs.chemaxon.com/display/docs/markush-features.md

    _translate_mdl_atomlist_atoms(out)

    mcgs = _collect_posvar_bonds(out)
    srus = _collect_repeating_units(out)

    # assign IDs to bonds/atoms (1-based: a1, a2, ... and b1, b2, ...)
    id_prop = prop_prefix + 'id'
    for atom in out.GetAtoms():
        atom.SetProp(id_prop, f'a{atom.GetIdx() + 1}')
    for bond in out.GetBonds():
        bond.SetProp(id_prop, f'b{bond.GetIdx() + 1}')

    # the "sgroups" property is only written when there is something
    # to describe
    if mcgs or srus:
        out.SetProp(prop_prefix + 'sgroups',
                    cxsmiles._mcgs_and_srus_as_rdcml_json(out, mcgs, srus))

    return out.GetMol()


class MdlFileReader(contextlib.AbstractContextManager):
    '''
    Iterator over the molecules of an MDL (SD) file, each one passed
    through `translate_mdl_enumerable_features` before it is returned.

    Can be used as a context manager; entering returns the reader itself.
    '''

    def __init__(self, filename, prop_prefix=common.CML_PROP_PREFIX):
        '''
        :param filename: Name of the file to read, handed to
            `rdkit.Chem.SDMolSupplier`.
        :type filename: str

        :param prop_prefix: Prefix forwarded to
            `translate_mdl_enumerable_features` for every molecule.
        :type prop_prefix: str
        '''
        self._supplier = Chem.SDMolSupplier(filename)
        self._prop_prefix = prop_prefix

    def __enter__(self):
        return self

    def __exit__(self, *exc_details):
        # nothing is released here; returning None lets any exception
        # raised inside the `with` body propagate
        return None

    def __iter__(self):
        return self

    def __next__(self):
        '''
        Returns the next translated molecule.

        Falsy entries produced by the supplier are skipped (presumably
        `None` for records RDKit could not parse -- verify). The
        `StopIteration` raised by the exhausted supplier propagates and
        ends the iteration.
        '''
        while True:
            if raw := next(self._supplier):
                break
        return translate_mdl_enumerable_features(raw, self._prop_prefix)