Source code for schrodinger.rdkit.filtering

"""
PathFinder functions for filtering and for computing descriptors, including
similarity metrics.
"""

import functools
import heapq
import re
import sys

import networkx
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Fingerprints import FingerprintMols

import schrodinger.application.canvas.base as canvas
from schrodinger.rdkit import molio
from schrodinger.structutils import analyze
from schrodinger.structutils import filter
from schrodinger.rdkit import rdkit_adapter
from schrodinger.ui.qt.filter_dialog_dir import filter_core
from schrodinger.utils import log

logger = log.get_output_logger("pathfinder")

DESCRIPTORS_DICT = dict(Descriptors.descList)

# rdMolDescriptors that only take a Mol object
DESCRIPTORS_DICT['NumSpiroAtoms'] = rdMolDescriptors.CalcNumSpiroAtoms
DESCRIPTORS_DICT['NumBridgeheadAtoms'] = rdMolDescriptors.CalcNumBridgeheadAtoms

AlogPTyper = canvas.ChmAtomTyperAlogP()


[docs]def NumChiralCenters(mol):
    """
    Wrapper for Chem.FindMolChiralCenters, so we can include it in
    DESCRIPTORS_DICT and pretend it's an RDKit descriptor.

    :type mol: Mol
    :rtype: int
    """
    return len(Chem.FindMolChiralCenters(mol, includeUnassigned=True))


DESCRIPTORS_DICT['NumChiralCenters'] = NumChiralCenters


[docs]def TPSAIncludeSandP(mol):
    """
    The TPSA including S and P.

    :type mol: Chem.Mol
    :rtype: float
    """
    return Chem.rdMolDescriptors.CalcTPSA(mol, force=True, includeSandP=True)


DESCRIPTORS_DICT['TPSAIncludeSandP'] = TPSAIncludeSandP

# atomic number and correction value for the MW corrected for the McGowan volume
# values based on Lobell, M. et al. In silico ADMET traffic lights as a tool for
# the prioritization of HTS hits. ChemMedChem (2006) 1, 1229-36.
_MW_CORRECTION_DICT = {9: -13.77, 17: -16.23, 35: -53.67, 53: -89.54}


[docs]def MVCorrMW(mol):
    """
    The Molecular weight corrected for McGowan Volume, so it can be added
    to the DESCRIPTORS_DICT and pretend it is an RDKit descriptor.

    :type mol: Chem.Mol
    :rtype: float
    """
    mw = Chem.Descriptors.MolWt(mol)
    for a in mol.GetAtoms():
        mw += _MW_CORRECTION_DICT.get(a.GetAtomicNum(), 0)
    return mw


DESCRIPTORS_DICT['MVCorrMW'] = MVCorrMW


@functools.lru_cache(maxsize=1)
def _get_crb_path_lengths(mol):
    """
    Determine the lengths of consecutive rotor systems in a molecule.

    :type mol: Chem.Mol
    :rtype: tuple of int
    """
    EXCLUDE_ROTOR_ATOM = 's_user_exclude_rotor_atom'
    EXCLUDED_ROTOR = {
        '[#6](=O)[#7][#6,#7]': [1, 3],  # amide exclude C and N
        '[#6](=O)[OX2][#6]': [1, 3],  # ester exclude C and single bonded O
        '[#6][SX4](=O)(=O)[#7]': [2, 5],  # CSO2N exclude S and N
    }

    st = rdkit_adapter.from_rdkit(mol)

    # Annotate atoms for rotors that should not be considered
    for pattern in EXCLUDED_ROTOR:
        for match in analyze.evaluate_smarts_canvas(st, pattern):
            for pos_index, at_num in enumerate(match, start=1):
                if pos_index in EXCLUDED_ROTOR[pattern]:
                    st.atom[at_num].property[EXCLUDE_ROTOR_ATOM] = pattern

    # Build a graph for the connection table
    st_graph = networkx.Graph()
    for rotor in analyze.rotatable_bonds_iterator(st):
        if all(st.atom[at_idx].property.get(EXCLUDE_ROTOR_ATOM)
               for at_idx in rotor):
            continue
        st_graph.add_edge(*rotor)

    # Report on the non-degenerate consecutive rotor paths.
    uniq_paths = networkx.connected_components(st_graph)
    return sorted((len(path) - 1 for path in uniq_paths), reverse=True)


[docs]def LongestConsecutiveRBCount(mol):
    """
    The maximum number of consecutive rotatable bonds in the molecule.

    :type mol: Chem.Mol
    :rtype: int
    """
    return max(_get_crb_path_lengths(mol), default=0)


DESCRIPTORS_DICT['LongestConsecutiveRBCount'] = LongestConsecutiveRBCount


[docs]def MeanConsecutiveRBCount(mol):
    """
    The mean value of consecutive rotatable bonds in the molecule.

    :type mol: Chem.Mol
    :rtype: int
    """
    crb_list = _get_crb_path_lengths(mol)
    return sum(crb_list) / (len(crb_list) or 1)


DESCRIPTORS_DICT['MeanConsecutiveRBCount'] = MeanConsecutiveRBCount


[docs]def LargestRingAtomCount(mol):
    """
    The size of the largest ring in mol.

    :type mol: Chem.Mol
    :rtype: int
    """
    atom_rings = mol.GetRingInfo().AtomRings()
    return max((len(x) for x in atom_rings), default=0)


DESCRIPTORS_DICT['LargestRingAtomCount'] = LargestRingAtomCount


# cache this so consecutive calls with the same mol will avoid re-computation
# when a AlogP and logS are both needed
[docs]@functools.lru_cache(maxsize=1)
def AlogP(mol):
    """
    The AlogP for the molecule as computed by canvas.

    The canvas calculation is based on J. Phys. Chem A 1998, 102, 3762-3772.
    Prediction of Hydrophobic (Lipophilic) Properties of Small Organic
    Molecules. Using Fragmental Methods: An Analysis of ALOGP and CLOGP Methods.
    A. K. Ghose et al.

    :type mol: Chem.Mol
    :rtype: float
    """
    mol = canvas.ChmMol.fromSMILES(Chem.MolToSmiles(mol))
    add_implicit_hydrogen = True
    return AlogPTyper.calculateScalar(mol, add_implicit_hydrogen)


DESCRIPTORS_DICT['AlogP'] = AlogP


[docs]def NumSAtoms(mol):
    """
    The number of S atoms in the molecule.

    :type mol: Chem.Mol
    :rtype: int
    """

    sulfur_query = Chem.MolFromSmarts('[#16]')
    return len(mol.GetSubstructMatches(sulfur_query))


DESCRIPTORS_DICT['NumSAtoms'] = NumSAtoms


[docs]def NumAlkyne(mol):
    """
    The number of alkynes in the molecule.

    :type mol: Chem.Mol
    :rtype: int
    """
    alkyne_query = Chem.MolFromSmarts('C#C')
    return len(mol.GetSubstructMatches(alkyne_query))


DESCRIPTORS_DICT['NumAlkyne'] = NumAlkyne


[docs]def FractionAromatic(mol):
    """
    The fraction of heavy atoms in the molecule that are aromatic atoms.

    :type mol: Chem.Mol
    :rtype: float
    """
    return len(mol.GetAromaticAtoms()) / mol.GetNumHeavyAtoms()


DESCRIPTORS_DICT['FractionAromatic'] = FractionAromatic


[docs]def logS(mol):
    """
    Calculate ESOL based on descriptors in the Delaney paper, coefficient
    refit for RDKit.

    :type mol: Chem.Mol
    :rtype: float
    """
    value = 0.26121066137801696
    for coeff, prop_key in (
        (-0.0066138847738667125, 'MolWt'),
        (-0.7416739523408995, 'AlogP'),
        (0.003451545565957996, 'NumRotatableBonds'),
        (-0.42624840441316975, 'FractionAromatic'),
    ):
        value += coeff * DESCRIPTORS_DICT[prop_key](mol)
    return value


DESCRIPTORS_DICT['logS'] = logS


[docs]def NumNonSpiroCR4(mol):
    """
    The number of non spiro quaternary carbons attached to C, N, O or S.

    :type mol: Chem.Mol
    :rtype: int
    """

    query = Chem.MolFromSmarts('[C;!R2](-[#6,#7,#8,#16])(-[#6,#7,#8,#16])'
                               '(-[#6,#7,#8,#16])-[#6,#7,#8,#16]')
    return len(mol.GetSubstructMatches(query))


DESCRIPTORS_DICT['NumNonSpiroCR4'] = NumNonSpiroCR4


[docs]def NumFiveFiveRings(mol):
    """
    The number of edge fused 5 atom ring pairs.

    :type mol: Chem.Mol
    :return: int
    """
    query = Chem.MolFromSmarts('*~1~*~*~*~2~*~*~*~*~1~2')
    return len(mol.GetSubstructMatches(query))


DESCRIPTORS_DICT['NumFiveFiveRings'] = NumFiveFiveRings

# Prefix for similarity property names
SIMILARITY_BASE = 'r_reaction_similarity'

DEFAULT_DESCRIPTORS = \
    'MolLogP,MolWt,NumChiralCenters,NumHAcceptors,NumHDonors,TPSA'


[docs]class JSONFilterAdapter(object):
    """
    An adapter to make a
    schrodinger.ui.qt.filter_dialog_dir.filter_core.Filter look
    like a schrodinger.structutils.filter.Filter.
    """

[docs]    def __init__(self, filename):
        """
        Create a filter object given a JSON filename.

        :type filename: str
        """
        self.filter_obj = filter_core.Filter.fromFile(filename)

[docs]    def filter(self, mols):
        """
        A generator that yields only the mols that pass the filter conditions.

        :type structures: iterable of rdkit.Chem.Mol
        """
        for mol in mols:
            props = rdkit_adapter.translate_rdkit_props_dict(
                mol.GetPropsAsDict())
            if self.filter_obj.doPropsMatch(props):
                yield mol
            else:
                logger.debug(f'Filtered out product; props: {props}')

[docs]    def getPropertyNames(self):
        """
        Return the set of properties used by all the filters in this object.

        :rtype: set of str
        """
        return {c.prop for c in self.filter_obj.criteria}


[docs]class ComparableMol:
    """
    A simple wrapper for a Mol object, adding the __lt__ operator so the mols
    can be put in a heap or sorted. Any double property may be used as the key.
    """

[docs]    def __init__(self, mol, prop, reverse=False):
        """
        :param mol: molecule
        :type mol: rdkit.Chem.rdchem.Mol

        :param prop: key property name
        :type prop: str

        :param reverse: if true, invert the the comparison (use > instead of <)
        :type reverse: bool
        """
        self.mol = mol
        self.prop = prop
        self.reverse = reverse

    def __lt__(self, other):
        x = self.mol.GetDoubleProp(self.prop)
        y = other.mol.GetDoubleProp(self.prop)
        if self.reverse:
            return x > y
        else:
            return x < y


[docs]def keep_top_n(products, nkeep, nsample, prop, reverse=False):
    """
    A generator that yields only the best `nkeep` molecules out of the first
    `nsample` molecules in `products`, compared by `prop`. By default, the
    highest values of `prop` are kept; when `reverse` is true, the lowest values
    are kept instead.

    NOTE: this function holds all `nkeep` molecules in memory at once.

    :param products: molecules to filter
    :type products: iterator of rdkit.Chem.Mol

    :param nkeep: maximum number of molecules to yield
    :type nkeep: int

    :param nsample: maximum number of molecules to consume from `products`
    :type nsample: int

    :param prop: key property name
    :type prop: str

    :param reverse: if true, keep the lowest instead of the highest values
    :type reverse: bool

    :return: filtered/sorted products
    :rtype: generator of rdkit.Chem.Mol
    """
    heap = []
    for i, mol in enumerate(products, 1):
        cmol = ComparableMol(mol, prop, reverse)
        if len(heap) < nkeep:
            heapq.heappush(heap, cmol)
        else:
            heapq.heappushpop(heap, cmol)
        if i == nsample:
            break
    heap.sort()
    heap.reverse()
    for cmol in heap:
        yield cmol.mol


[docs]def get_filter(filename, smarts_filter=False):
    """
    Factory function to generate a Filter-like object from a filename. Supports
    JSON and propfilter-like property filters files as well as canvasSearch-like
    SMARTS filter files.

    :param filename: filename
    :type filename: str

    :param smarts_filter: is this a SMARTS filter file?
    :type smarts_filter: bool

    :return: filter object
    :rtype: schrodinger.structutils.filter.Filter
    """
    if smarts_filter:
        filter_class = filter.SmartsFilter
    elif filename and filename.endswith('.json'):
        filter_class = JSONFilterAdapter
    else:
        filter_class = filter.PropFilter
    try:
        return filter_class(filename=filename)
    except ValueError as e:
        sys.exit("%s:%s" % (filename, e))


[docs]@functools.lru_cache()
def get_fingerprint(mol):
    """
    A cached version of RDKit's FingerprintMol, so if it's called again with a
    recently used molecule, we'll save the recalculation of the fingerprint.

    :param mol: molecule
    :type mol: rdkit.Chem.rdchem.Mol

    :return: fingerprint
    :rtype: rdkit.DataStructs.cDataStructs.ExplicitBitVect
    """
    # Note: the LRU cache resulted in an almost 40% reduction in CPU time in a
    # test of a one-step enumeration with SMILES output.
    return FingerprintMols.FingerprintMol(mol)


[docs]def compute_similarity(mol1, mol2):
    """
    Compute the Tanimoto similarity between the RDKit fingerprints of two
    molecules.

    :param mol1: molecule
    :type mol1: rdkit.Chem.rdchem.Mol

    :param mol2: molecule
    :type mol2: rdkit.Chem.rdchem.Mol

    :return: similarity
    :rtype: float
    """
    fp1 = get_fingerprint(mol1)
    fp2 = get_fingerprint(mol2)
    return DataStructs.FingerprintSimilarity(fp1, fp2)


[docs]def add_descriptors(mol, descriptor_names, refs):
    """
    Compute the requested RDKit descriptors and store them as properties of the
    Mol object.

    :type mol: Mol
    :type descriptor_names: list of str

    :param refs: list of reference molecules (only used if similarity
        descriptors are requested)
    :type refs: sequence of Mol

    :return: input mol (modified in place)
    :rtype: Mol
    """
    for name in descriptor_names:
        if name.startswith(SIMILARITY_BASE):
            ref_index = int(name.split('_')[-1])
            value = compute_similarity(mol, refs[ref_index - 1])
        else:
            # We accept both full and short prop name for RDKit props.
            name = re.sub(r'^._rdkit_', '', name)
            func = DESCRIPTORS_DICT[name]
            try:
                value = func(mol)
            except ValueError:
                # This has been observed when an invalid molecule nevertheless
                # passed the previous SanitizeMol call, due to the RDKit bug
                # described in ENUM-242. So if any descriptors failed to be
                # computed, we'll just skip the molecule.
                return None
        if "Mean" not in name and ("Num" in name or "Count" in name or
                                   name.startswith("fr_")):
            mol.SetIntProp(name, value)
        else:
            mol.SetDoubleProp(name, value)
    return mol


[docs]def add_descriptors_gen(mols, descriptors, refs=None):
    """
    Given a generator of Mol, return a generator of Mol with descriptors added.
    The original Mol objects are modified.

    :param descriptors: names of the descriptors to add
    :type descriptors: sequence of str

    :param refs: list of reference molecules (only used if similarity
        descriptors are requested)
    :type refs: sequence of Mol

    """
    for mol in mols:
        if mol is not None:
            mol = add_descriptors(mol, descriptors, refs)
        if mol is not None:
            yield mol


[docs]def add_filters(products,
                ref_mols_file=None,
                ref_mols=None,
                property_filter_file=None,
                smarts_filter_file=None,
                descriptors=''):
    if ref_mols:
        refs = ref_mols
    elif ref_mols_file:
        refs = list(molio.get_mol_reader(ref_mols_file))
    else:
        refs = []
    prop_filter = get_filter(property_filter_file)
    try:
        smarts_filter = get_filter(smarts_filter_file, smarts_filter=True)
    except ValueError as e:
        sys.exit("%s:%s" % (smarts_filter_file, e))
    descriptors = set(d for d in descriptors.split(',') if d)
    descriptors |= {f'{SIMILARITY_BASE}_{i}' for i in range(1, len(refs) + 1)}
    descriptors |= prop_filter.getPropertyNames()
    products = add_descriptors_gen(products, descriptors, refs)
    products = prop_filter.filter(products)
    products = smarts_filter.filter(products)
    return products