Source code for schrodinger.application.scaffold_enumeration.mdl

import contextlib
import itertools
import re

from rdkit import Chem

from . import atomlist
from . import common
from . import cxsmiles
from . import markush


def _translate_mdl_atomlist_atoms(mol):
    '''
    Swaps every MDL "atom list" query atom in `mol` for a plain
    (non-query) atom that carries the list of allowed elements.

    A query atom qualifies when its SMARTS starts with a bracketed,
    comma-separated list of atomic numbers (e.g. ``[#6,#7,#8]``). The
    replacement atom is created from the first atomic number of the list;
    the element symbols of the whole list are attached to it through
    `atomlist.set_atom_elements`.

    :param mol: Molecule to be modified in place.
    :type mol: `rdkit.Chem.rdchem.RWMol`
    '''

    periodic_table = Chem.GetPeriodicTable()

    # Replacements are gathered first and applied in a second pass, so the
    # molecule is left untouched while its atoms are being iterated.
    replacements = []
    for query_atom in mol.GetAtoms():
        if not query_atom.HasQuery():
            continue

        smarts = query_atom.GetSmarts()
        if re.match(r'\[(#\d+,)*#\d+\]', smarts) is None:
            continue

        atomic_numbers = []
        for found in re.finditer(r'#(\d+)[,\]]', smarts):
            atomic_numbers.append(int(found.group(1)))

        plain_atom = Chem.Atom(atomic_numbers[0])
        symbols = map(periodic_table.GetElementSymbol, atomic_numbers)
        atomlist.set_atom_elements(plain_atom, symbols)
        replacements.append((query_atom.GetIdx(), plain_atom))

    for atom_index, plain_atom in replacements:
        mol.ReplaceAtom(atom_index, plain_atom)


def _collect_posvar_bonds(mol):
    '''
    Gathers the "multi-center groups" that back "position variation
    bonds", assuming `mol` was read from MDL format. As a side effect the
    "center" query atom of every recognized group is replaced with a
    regular atom of atomic number zero.

    A bond is recognized when its ``_MolFileBondEndPts`` property looks
    like a parenthesized, whitespace-separated list of integers and one of
    its two atoms is a query atom whose SMARTS is ``*``.

    :param mol: Molecule to be processed (modified in place).
    :type mol: `rdkit.Chem.rdchem.RWMol`

    :return: List of named tuples describing the "multi-center groups"
        found in `mol`.
    :rtype: list(.cxsmiles.MCG)
    '''

    groups = []

    for bond in mol.GetBonds():
        # bonds lacking the end-points property are not position-variant
        try:
            endpoints = bond.GetProp('_MolFileBondEndPts')
        except KeyError:
            continue
        if re.match(r'\((\d+\s+)+\d+\)', endpoints) is None:
            continue

        # the "center" is whichever end of the bond is a wildcard query atom
        center = next((end
                       for end in (bond.GetBeginAtom(), bond.GetEndAtom())
                       if end.HasQuery() and end.GetSmarts() == '*'), None)
        if center is None:
            continue

        # strip the parentheses and shift the 1-based numbers to 0-based
        # indices; the leading entry is skipped (presumably it is the
        # number of end points -- confirm against the MDL format)
        indices = [int(token) - 1 for token in endpoints[1:-1].split()]
        center_index = center.GetIdx()
        groups.append(cxsmiles.MCG(center=center_index, atoms=indices[1:]))
        mol.ReplaceAtom(center_index, Chem.Atom(0))

    return groups


def _collect_repeating_units(mol):
    '''
    Extracts the "repeating units" (substance groups whose ``TYPE`` is
    ``SRU``) from `mol`. Groups that lack any of the ``TYPE``, ``LABEL``
    or ``CONNECT`` properties are ignored.

    Side effect: all substance groups are removed from `mol`, whether
    they were recognized or not.

    :param mol: Molecule to be inspected.
    :type mol: `rdkit.Chem.rdchem.RWMol`

    :return: List of named tuples describing the "repeating units"; the
        ``LABEL`` property becomes the subscript and the lower-cased
        ``CONNECT`` property becomes the superscript.
    :rtype: list(.cxsmiles.SRU)
    '''

    units = []

    for sgroup in Chem.GetMolSubstanceGroups(mol):
        try:
            if sgroup.GetProp('TYPE') != 'SRU':
                continue
            label = sgroup.GetProp('LABEL')
            connectivity = sgroup.GetProp('CONNECT')
        except KeyError:
            # incomplete group: nothing to record
            continue

        unit = cxsmiles.SRU(atoms=list(sgroup.GetAtoms()),
                            subscript=label,
                            superscript=connectivity.lower())
        units.append(unit)

    Chem.ClearMolSubstanceGroups(mol)

    return units


def translate_mdl_enumerable_features(mol, prop_prefix=common.CML_PROP_PREFIX):
    '''
    Translates metadata that pertains to the "enumerable features"
    from the conventions assumed by the RDKit SDMolSupplier to the form
    expected by this package.

    The input molecule is not modified; all changes are made on a copy.

    :param mol: Molecule with enumerable features in MDL "language".
    :type mol: `rdkit.Chem.rdchem.ROMol`

    :param prop_prefix: Prefix for the names of the properties set by this
        function: ``<prefix>id`` on every atom and bond, and
        ``<prefix>sgroups`` on the molecule.
    :type prop_prefix: str

    :return: Adapted molecule.
    :rtype: `rdkit.Chem.rdchem.Mol`
    '''

    out = Chem.RWMol(mol)

    # R-labels
    markush.canonicalize_R_labels(out)
    rlabels = markush.get_rlabels_map(out)

    # convert R-group placeholders into the regular (non-query) atoms
    for idx in itertools.chain.from_iterable(rlabels.values()):
        atom = Chem.Atom(0)
        out.ReplaceAtom(idx, atom, preserveProps=True)

    # "atom lists", "position variation bonds" and "repeating units"
    # https://docs.chemaxon.com/display/docs/markush-features.md

    _translate_mdl_atomlist_atoms(out)
    mcgs = _collect_posvar_bonds(out)
    srus = _collect_repeating_units(out)

    # assign IDs to bonds/atoms (1-based: "a1", "a2", ... / "b1", "b2", ...)
    id_prop = prop_prefix + 'id'

    for atom in out.GetAtoms():
        atom.SetProp(id_prop, f'a{atom.GetIdx() + 1}')

    for bond in out.GetBonds():
        bond.SetProp(id_prop, f'b{bond.GetIdx() + 1}')

    # the "sgroups" property is only set when there is something to record
    if mcgs or srus:
        out.SetProp(prop_prefix + 'sgroups',
                    cxsmiles._mcgs_and_srus_as_rdcml_json(out, mcgs, srus))

    return out.GetMol()


class MdlFileReader(contextlib.AbstractContextManager):
    '''
    Iterator over the molecules of an MDL (SD) file that yields each
    molecule after passing it through `translate_mdl_enumerable_features`.

    Can be used as a context manager; entering returns the reader itself
    and exiting does nothing (exceptions are not suppressed).
    '''

    def __init__(self, filename, prop_prefix=common.CML_PROP_PREFIX):
        '''
        :param filename: Name of the file, handed to `Chem.SDMolSupplier`.
        :type filename: str

        :param prop_prefix: Property name prefix forwarded to
            `translate_mdl_enumerable_features` for every molecule.
        :type prop_prefix: str
        '''

        self._supplier = Chem.SDMolSupplier(filename)
        self._prop_prefix = prop_prefix

    def __enter__(self):
        return self

    def __exit__(self, *exc_details):
        # returning None lets any in-flight exception propagate
        return None

    def __iter__(self):
        return self

    def __next__(self):
        '''
        Returns the next molecule of the file, translated.

        Falsy entries produced by the supplier are skipped (presumably
        these are the `None` placeholders RDKit emits for records it
        cannot parse). `StopIteration` raised by the supplier ends the
        iteration.
        '''

        while True:
            if raw := next(self._supplier):
                break
        return translate_mdl_enumerable_features(raw, self._prop_prefix)