Source code for schrodinger.livedesign.convert

import enum
from typing import Generator
from typing import Union

import numpy as np
from rdkit import Chem
from rdkit import Geometry

from schrodinger import rdkit_extensions
from schrodinger.infra import structure as infrastructure
from schrodinger.rdkit import rdkit_adapter

ATOM_PROP_ATOM_LABEL = "atomLabel"
BOND_PROP_BOND_STEREO = "_MolFileBondStereo"
BOND_PROP_BOND_CONFIG = "_MolFileBondCfg"
MOL_PROP_ATTACHPT = "molAttchpt"
MOL_PROP_R_LABEL = "_MolFileRLabel"


[docs]class Format(enum.Enum):
    AUTO_DETECT = rdkit_extensions.Format.AUTO_DETECT
    SMILES = rdkit_extensions.Format.SMILES
    EXTENDED_SMILES = rdkit_extensions.Format.EXTENDED_SMILES
    SMARTS = rdkit_extensions.Format.SMARTS
    MDL_MOLV2000 = rdkit_extensions.Format.MDL_MOLV2000
    MDL_MOLV3000 = rdkit_extensions.Format.MDL_MOLV3000
    INCHI = rdkit_extensions.Format.INCHI
    INCHI_KEY = rdkit_extensions.Format.INCHI_KEY
    PDB = rdkit_extensions.Format.PDB
    # TODO: deprecate
    SDF = rdkit_extensions.Format.MDL_MOLV3000
    CXSMILES = rdkit_extensions.Format.EXTENDED_SMILES


[docs]def convert(data: str, input_format: Format, output_format: Format) -> str:
    """
    Main entrypoint for converting one serialization to another. A few
    convience functions are provided below for common LD conversions.

    :param data: input text string
    :param input_format: expected format for input string
    :param output_format: desired format for output string
    :return: converted text string
    """
    if output_format == Format.AUTO_DETECT:
        raise ValueError("Output format cannot be autodetected")

    if _is_reaction(data, input_format):
        return _from_rdkit_reaction(_to_rdkit_reaction(data,
                                                       format=input_format),
                                    format=output_format)
    else:
        return _from_rdkit_mol(_to_rdkit_mol(data, format=input_format),
                               format=output_format)


[docs]def sdf_to_rdkit(
    molblock: str
) -> Union[Chem.rdchem.Mol, Chem.rdChemReactions.ChemicalReaction]:
    """
    :param molblock: given MDL or MDL RXN
    :return: corresponding RDKit mol or reaction
    """
    if _is_reaction(molblock, Format.MDL_MOLV3000):
        return _to_rdkit_reaction(molblock, format=Format.MDL_MOLV3000)
    else:
        return _to_rdkit_mol(molblock, format=Format.MDL_MOLV3000)


[docs]def rdkit_to_sdf(mol: Chem.rdchem.Mol) -> str:
    """
    :param input: given RDKit mol
    :return: corresponding sdf molblock
    """
    return _from_rdkit_mol(mol, format=Format.MDL_MOLV3000)


[docs]def rdkit_to_cxsmiles(mol: Chem.rdchem.Mol) -> str:
    """
    :param mol: given RDKit mol
    :return: corresponding extended SMILES stripped of coordinates
    """
    return _from_rdkit_mol(mol, format=Format.EXTENDED_SMILES)


[docs]def get_sd_reader(molblocks: str) -> Generator[Chem.rdchem.Mol, None, None]:
    """
    :param molblocks: string of sdf molblocks
    :return: generator that provides mols from sdf molblcoks
    """
    with infrastructure.TextBlockReader(
            molblocks, infrastructure.FileFormat.SD) as text_reader:
        for molblock in text_reader:
            molblock = molblock.decode()
            # Fix in SHARED-8782: white space after sdf delimiter causes
            # TextBlockReader to find an extra text block
            if molblock.strip() == '$$$$':
                continue
            yield _to_rdkit_mol(molblock, Format.MDL_MOLV3000)


[docs]def atom_has_attachment_point(atom):
    return atom.HasProp(ATOM_PROP_ATOM_LABEL) and atom.GetProp(
        ATOM_PROP_ATOM_LABEL).startswith("_AP")


def _is_reaction(data: str, format: Format) -> bool:
    if format == Format.MDL_MOLV3000:
        return data.startswith("$RXN")
    elif format in (Format.SMILES, Format.EXTENDED_SMILES, Format.SMARTS):
        return ">>" in data
    else:
        return False  # Unsupported reaction format


def _to_rdkit_mol(data: str,
                  format: Format = Format.AUTO_DETECT) -> Chem.rdchem.Mol:

    with rdkit_adapter.convert_log_to_exception():
        mol = rdkit_extensions.text_to_rdmol(data, format.value)

        # partial sanitization to preserve input molecule
        ops = (Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^
               Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_KEKULIZE ^
               Chem.SANITIZE_FINDRADICALS ^ Chem.SANITIZE_CLEANUPCHIRALITY)
        Chem.SanitizeMol(mol, sanitizeOps=ops)
        mol.UpdatePropertyCache(strict=False)

        if format == Format.MDL_MOLV3000:
            mol = clean_up_polymer_brackets(mol, keep_existing_brackets=True)

        return mol


def _from_rdkit_mol(mol: Chem.rdchem.Mol, format: Format) -> str:
    if format == Format.MDL_MOLV3000:
        Chem.rdmolops.KekulizeIfPossible(mol, clearAromaticFlags=True)
    return rdkit_extensions.rdmol_to_text(mol, format.value)


def _to_rdkit_reaction(
    data: str,
    format: Format = Format.AUTO_DETECT
) -> Chem.rdChemReactions.ChemicalReaction:
    return rdkit_extensions.text_to_reaction(data, format.value)


def _from_rdkit_reaction(rxn: Chem.rdChemReactions.ChemicalReaction,
                         format: Format) -> str:
    if format == Format.INCHI:
        raise RuntimeError("Cannot convert reaction to INCHI format")
    return rdkit_extensions.reaction_to_text(rxn, format.value)


def _detect_unkekulized_atoms(m):
    opts = Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_CLEANUPCHIRALITY ^ Chem.SANITIZE_FINDRADICALS
    with rdkit_adapter.suppress_rdkit_log():
        errs = Chem.DetectChemistryProblems(m, opts)
    for err in errs:
        if err.GetType() == 'KekulizeException':
            return sorted(err.GetAtomIndices())
    return None


[docs]def add_hs_to_aromatic_nitrogen(mol):
    """
    Intended to be used with molecules which have kekulization failures due to
    aromatic system(s) containing Ns where the user hasn't provided the
    location of the implicit Hs in the system.

    This picks an arbitrary (but canonical) aromatic N to add an H to in each
    aromatic system.

    Returns the original mol if it can't fix it.
    """
    Chem.GetSymmSSSR(mol)
    tm = Chem.Mol(mol)
    unkekulized_atoms = _detect_unkekulized_atoms(tm)
    while unkekulized_atoms:
        tm, new_unkekulized_atoms = _fix_ring_system(tm, unkekulized_atoms)
        if new_unkekulized_atoms == unkekulized_atoms:
            # got stuck, return the original
            return mol
        unkekulized_atoms = new_unkekulized_atoms

    return tm


def _fix_ring_system(m, unkekulized_atoms):
    m.UpdatePropertyCache(False)

    # we want to always generate the same result for the same molecule
    # Which result to return will cause arguments, yay, but we'll
    # prefer smaller rings, then use the canonical atom ranking to break ties
    ranks = Chem.CanonicalRankAtoms(m)
    ri = m.GetRingInfo()
    atoms_to_check = unkekulized_atoms[:]
    atoms_to_check.sort(key=lambda x: (ri.MinAtomRingSize(x), ranks[x]))
    while atoms_to_check:
        ai = atoms_to_check.pop(0)
        atom = m.GetAtomWithIdx(ai)
        if atom.GetAtomicNum() != 7:
            continue
        if atom.GetDegree() != 2 or atom.GetNumExplicitHs() != 0:
            continue

        tm = Chem.Mol(m)
        atom = tm.GetAtomWithIdx(ai)
        atom.SetNumExplicitHs(1)
        atom.SetNoImplicit(True)
        # did we fix something?
        new_unkekulized_atoms = _detect_unkekulized_atoms(tm)
        if new_unkekulized_atoms != unkekulized_atoms:
            return tm, new_unkekulized_atoms
    return m, unkekulized_atoms


def _get_perpendicular_bracket(start_xyz, end_xyz):
    """
    Get coordinates for a bracket bisecting bond X. The bracket length will be
    the same length as the bond that it bisects.
    """
    xy_change = start_xyz - end_xyz
    opposite_reciprocal = np.array([xy_change[1] / 2, (xy_change[0] * -1) / 2])
    midpoint = np.array([(start_xyz[0] + end_xyz[0]) / 2,
                         (start_xyz[1] + end_xyz[1]) / 2])
    bracket = [
        Geometry.Point3D(*(midpoint + opposite_reciprocal), 0),
        Geometry.Point3D(*(midpoint - opposite_reciprocal), 0),
        Geometry.Point3D(0, 0, 0)
    ]
    return bracket


[docs]def is_polymer(s_group):
    if not s_group.HasProp("TYPE"):
        raise ValueError(
            f"Substance group {s_group.GetIndexInMol()} does not have the TYPE property"
        )
    return s_group.GetProp("TYPE") in [
        "SRU", "MON", "COP", "CRO", "GRA", "MOD", "MER", "ANY"
    ]


[docs]def clean_up_polymer_brackets(mol,
                              revert_to_mol=None,
                              keep_existing_brackets=False):
    """
    Add polymer brackets back to mol.

    :param mol: RDKit mol to add polymer brackets to
    :param revert_to_mol: RDKit mol to revert to if polymer brackets cannot be
        added correctly to provided mol. This will occur when brackets cross
        more than one bond.
    :param keep_existing_brackets: whether to recalculate the positions of
        brackets that are already present
    :return: RDKit mol with polymer brackets
    """
    if not revert_to_mol:
        revert_to_mol = Chem.Mol(mol)

    substance_groups = Chem.GetMolSubstanceGroups(mol)
    Chem.ClearMolSubstanceGroups(mol)
    pos = mol.GetConformer().GetPositions()
    revert = False

    for s_group in substance_groups:
        # We can only calculate brackets for non-empty polymer groups. If that
        # is not the case, either keep the existing brackets, or clear them.
        if len(s_group.GetAtoms()) == 0 or not is_polymer(s_group):
            if not keep_existing_brackets:
                s_group.ClearBrackets()
            Chem.AddMolSubstanceGroup(mol, s_group)
            continue

        # do not recalculate coordinates for already existing brackets
        if keep_existing_brackets and len(s_group.GetBrackets()) != 0:
            Chem.AddMolSubstanceGroup(mol, s_group)
            continue

        bond_positions = [(pos[mol.GetBondWithIdx(bnd).GetBeginAtomIdx()],
                           pos[mol.GetBondWithIdx(bnd).GetEndAtomIdx()])
                          for bnd in s_group.GetBonds()]
        s_group.ClearBrackets()
        if (len(bond_positions) == 0):
            # put brackets around all substance group atoms
            atom_pos = [pos[a] for a in s_group.GetAtoms()]
            offset_from_atom = .5
            min_x = min([xyz[0] for xyz in atom_pos]) - offset_from_atom
            min_y = min([xyz[1] for xyz in atom_pos]) - offset_from_atom
            max_x = max([xyz[0] for xyz in atom_pos]) + offset_from_atom
            max_y = max([xyz[1] for xyz in atom_pos]) + offset_from_atom
            left_bracket = [
                Geometry.Point3D(min_x, min_y, 0),
                Geometry.Point3D(min_x, max_y, 0),
                Geometry.Point3D(0, 0, 0),
            ]
            right_bracket = [
                Geometry.Point3D(max_x, min_y, 0),
                Geometry.Point3D(max_x, max_y, 0),
                Geometry.Point3D(0, 0, 0),
            ]
            s_group.AddBracket(left_bracket)
            s_group.AddBracket(right_bracket)
        elif (len(bond_positions) == 2):
            s_group.AddBracket(
                _get_perpendicular_bracket(bond_positions[0][0],
                                           bond_positions[0][1]))
            s_group.AddBracket(
                _get_perpendicular_bracket(bond_positions[1][0],
                                           bond_positions[1][1]))
        else:
            revert = True
            break
        Chem.AddMolSubstanceGroup(mol, s_group)
    if revert:
        # when brackets have to go through 2 or more bonds, it is unlikely that
        # the new coordinates will allow a good placement of the polymer
        # brackets
        mol = Chem.Mol(revert_to_mol)
    return mol