Source code for schrodinger.application.scaffold_enumeration.rdcml

'''
Basic CML reader: http://www.xml-cml.org/
'''

import contextlib
import json
import xml.etree.ElementTree as ET
from collections import defaultdict

from rdkit import Chem

from . import common

#------------------------------------------------------------------------------#

# Map of CML bond orders to RDKit bond types.
_CML_BOND_ORDER_TO_RDK = {
    '1' : Chem.BondType.SINGLE,
    'S' : Chem.BondType.SINGLE,
    '2' : Chem.BondType.DOUBLE,
    'D' : Chem.BondType.DOUBLE,
    '3' : Chem.BondType.TRIPLE,
    'T' : Chem.BondType.TRIPLE,
    'A' : Chem.BondType.AROMATIC
} # yapf: disable

# CML atom attributes *not* to be set as RDKit atom properties.
_SPECIAL_CML_ATOM_PROPS = {
    'elementType',
    'formalCharge',
    'x2', 'y2',
    'x3', 'y3', 'z3'
} # yapf: disable

# CML bond attributes *not* to be set as RDKit bond properties.
_SPECIAL_CML_BOND_PROPS = {
    'atomRefs2',
    'order'
} # yapf: disable


#------------------------------------------------------------------------------#


[docs]def parse_and_drop_xmlns(source): ''' Parses `source` via `xml.etree.ElementTree.iterparse`, and removes namespace prefix (if present) from tags and attributes. :param source: File name or file-like object. :type source: str or file-like :return: Root of the tree. :rtype: `xml.etree.ElementTree.Element` ''' it = ET.iterparse(source) # from https://stackoverflow.com/a/33997423 for _, el in it: if '}' in el.tag: el.tag = el.tag.split('}', 1)[1] # strip all namespaces # strip namespaces of attributes too for at in list(el.attrib.keys()): if '}' in at: newat = at.split('}', 1)[1] el.attrib[newat] = el.attrib[at] del el.attrib[at] return it.root
#------------------------------------------------------------------------------# def _get_xml_atom_coordinates(xml_atom): ''' Returns `xml_atom` coordinates or None. :param xml_atom: CML atom element. :type xml_atom: `xml.etree.ElementTree.Element` :return: Cartesian coordinates or None. :rtype: [float, float, float] or None ''' # catch `TypeError` because xml.etree.ElementTree.Element.get() # does not raise KeyError -- it returns None instead try: return [float(xml_atom.get(c)) for c in ('x3', 'y3', 'z3')] except TypeError: pass try: return [float(xml_atom.get(c)) for c in ('x2', 'y2')] + [0.0] except TypeError: return None #------------------------------------------------------------------------------# def _make_rdk_atom_from_xml_atom(xml_atom, prop_prefix=common.CML_PROP_PREFIX): ''' Instantiates `rdkit.Chem.Atom` from the XML element data. :param xml_atom: XML element that represents CML atom. :type xml_atom: `xml.etree.ElementTree.Element` :param prop_prefix: Prefix to be added to the CML property names. :type prop_prefix: str :return: RDKit atom and its coordinates. :rtype: rdkit.Chem.Atom, [float, float, float] or None ''' element = xml_atom.get('elementType') if not element or element.upper() in ('R', 'X'): element = '*' rdk_atom = Chem.Atom(element) try: rdk_atom.SetFormalCharge(int(xml_atom.get('formalCharge', ''))) except ValueError: pass for (k, v) in xml_atom.attrib.items(): if k not in _SPECIAL_CML_ATOM_PROPS: rdk_atom.SetProp(prop_prefix + k, v) return rdk_atom, _get_xml_atom_coordinates(xml_atom) #------------------------------------------------------------------------------# def _decorate_rdk_bond_from_xml_bond(rdk_bond, xml_bond, prop_prefix=common.CML_PROP_PREFIX): ''' Propagates data from the XML element that represents CML bond to the RDKit bond. :param rdk_bond: RDKit bond. :type rdk_bond: rdkit.Chem.Bond :param xml_bond: XML element that represents CML bond. :type xml_bond: `xml.etree.ElementTree.Element` :param prop_prefix: Prefix to be added to the CML property names. :type prop_prefix: str ''' try: rdk_bond.SetBondType(_CML_BOND_ORDER_TO_RDK[xml_bond.get('order')]) except KeyError: pass for bs in xml_bond.findall('./bondStereo'): stereo = bs.text.strip().upper() if stereo == 'W': rdk_bond.SetBondDir(Chem.BondDir.BEGINWEDGE) elif stereo == 'H': rdk_bond.SetBondDir(Chem.BondDir.BEGINDASH) for (k, v) in xml_bond.attrib.items(): if k not in _SPECIAL_CML_BOND_PROPS: rdk_bond.SetProp(prop_prefix + k, v) #------------------------------------------------------------------------------# def _adapt_enhanced_stereo(mol, prop_prefix): ''' Translates "enhanced stereochemistry" data from MRV-specific properties into RDKit conventions. :param mol: R/W molecule. :type mol: `rdkit.Chem.RWMol` :param prop_prefix: Prefix used for the CML property names. :type prop_prefix: str ''' group_id_prop = prop_prefix + 'mrvStereoGroup' groups = defaultdict(list) for atom in mol.GetAtoms(): try: group_id = atom.GetProp(group_id_prop) groups[group_id].append(atom.GetIdx()) except (KeyError, ValueError): pass rdk_stereo_groups = [] for group_id, atom_indices in groups.items(): if group_id.startswith('or'): rdk_sg_type = Chem.StereoGroupType.STEREO_OR elif group_id.startswith('and'): rdk_sg_type = Chem.StereoGroupType.STEREO_AND elif group_id.startswith('abs'): rdk_sg_type = Chem.StereoGroupType.STEREO_ABSOLUTE else: continue rdk_stereo_groups.append( Chem.CreateStereoGroup(rdk_sg_type, mol, atom_indices)) mol.SetStereoGroups(rdk_stereo_groups) for rsg in rdk_stereo_groups: for atom in rsg.GetAtoms(): atom.ClearProp(group_id_prop) #------------------------------------------------------------------------------#
[docs]def rdk_mol_from_cml_element(xml_molecule, prop_prefix=common.CML_PROP_PREFIX): ''' Instantiates `rdkit.Chem.Mol` from the XML element. :param xml_molecule: XML element that represents CML molecule. :type xml_molecule: `xml.etree.ElementTree.Element` :param prop_prefix: Prefix to be added to the CML property names. :type prop_prefix: str :return: RDKit molecule. :rtype: rdkit.Chem.ROMol ''' rwmol = Chem.RWMol(Chem.Mol()) # atoms xml_atoms = xml_molecule.findall('./atomArray/atom') rdk_atoms_and_coordinates = (_make_rdk_atom_from_xml_atom( e, prop_prefix=prop_prefix) for e in xml_atoms) atom_id2idx = dict() # ID -> RDKit index atom_id_property = prop_prefix + 'id' coordinates = [] for rdk_atom, crd in rdk_atoms_and_coordinates: rdk_index = rwmol.AddAtom(rdk_atom) if rdk_atom.HasProp(atom_id_property): atom_id2idx[rdk_atom.GetProp(atom_id_property)] = rdk_index if crd: coordinates.append(crd) # bonds for xml_bond in xml_molecule.findall('./bondArray/bond'): atomRefs = xml_bond.get('atomRefs2').split() assert len(atomRefs) == 2 atom1_idx = atom_id2idx[atomRefs[0]] atom2_idx = atom_id2idx[atomRefs[1]] num_bonds = rwmol.AddBond(atom1_idx, atom2_idx) rdk_bond = rwmol.GetBondWithIdx(num_bonds - 1) _decorate_rdk_bond_from_xml_bond(rdk_bond, xml_bond, prop_prefix=prop_prefix) _adapt_enhanced_stereo(rwmol, prop_prefix=prop_prefix) mol = rwmol.GetMol() if len(coordinates) == mol.GetNumAtoms(): conformer = Chem.Conformer(len(coordinates)) for (i, xyz) in enumerate(coordinates): conformer.SetAtomPosition(i, xyz) mol.AddConformer(conformer) Chem.AssignChiralTypesFromBondDirs(mol) Chem.DetectBondStereochemistry(mol) # molecules (S-groups) sgroups = [] for xml_sgroup in xml_molecule.findall('./molecule'): sgroups.append({k: v for (k, v) in xml_sgroup.attrib.items()}) if sgroups: mol.SetProp(prop_prefix + 'sgroups', json.dumps(sgroups)) return mol
#------------------------------------------------------------------------------#
[docs]class CmlFileReader(contextlib.AbstractContextManager): ''' Does not need to be context manager (such need may arise in case we switch to a different XML parser). '''
[docs] def __init__(self, filename, prop_prefix=common.CML_PROP_PREFIX): self._filename = filename self._prop_prefix = prop_prefix
def __enter__(self): root = parse_and_drop_xmlns(self._filename) self._xml_molecules = root.findall( './MDocument/MChemicalStruct/molecule') # reverse the list to .pop() from it later self._xml_molecules.reverse() return self def __exit__(self, *exc_details): return None def __iter__(self): return self def __next__(self): try: xml_molecule = self._xml_molecules.pop() except IndexError: raise StopIteration return rdk_mol_from_cml_element(xml_molecule)
#------------------------------------------------------------------------------#