Source code for schrodinger.livedesign.molhash

"""
Generate a unique hash code for a molecule based on chemistry. If two
molecules are chemically "the same", they should have the same hash.

Used by Schrödinger's LiveDesign to determine if two molecules are
the same. LiveDesign makes changes to the molecule before molhash,
somewhat equivalent the steps available in
rdkit.Chem.MolStandardize.

Using molhash adds value beyond using SMILES because it:

* Ignores SMILES features that are not chemically meaningful
  (e.g. atom map numbers)
* Canonicalizes enhanced stereochemistry groups. For example
  `C[C@H](O)CC |&1:1|` and `C[C@@H](O)CC |&1:1|` have the same
  molhash
* Canonicalizes S group data (for example, polymer data)

There are two hash schemes, the default, and one in which
tautomers are considered equivalent.


Copyright (C) 2022 Schrödinger, LLC

"""

import enum
import hashlib
import json
import re
from typing import Iterable
from typing import Optional

from rdkit import Chem
from rdkit.Chem import EnumerateStereoisomers
from rdkit.Chem import rdMolHash

ATOM_PROP_MAP_NUMBER = 'molAtomMapNumber'

EXTRA_ISOTOPE_REMOVAL_REGEX = re.compile(r'\[[1-3]0*([1-9]?[0-9]*[A-Z][a-z]?)@')
ENHANCED_STEREO_GROUP_REGEX = re.compile(r'((?:a|[&o]\d+):\d+(?:,\d+)*)')

ENHANCED_STEREO_GROUP_WEIGHTS = {
    Chem.StereoGroupType.STEREO_AND: 1000,
    Chem.StereoGroupType.STEREO_OR: 2000,
    Chem.StereoGroupType.STEREO_ABSOLUTE: 3000,
}

EMPTY_MOL_TAUTOMER_HASH = "_0_0"


[docs]class HashLayer(enum.Enum): """ :cvar CANONICAL_SMILES: RDKit canonical SMILES (excluding enhanced stereo) :cvar ESCAPE: arbitrary other information to be incorporated :cvar FORMULA: a simple molecular formula for the molecule :cvar NO_STEREO_SMILES: RDKit canonical SMILES with all stereo removed :cvar NO_STEREO_TAUTOMER_HASH: the above tautomer hash lacking all stereo :cvar SGROUP_DATA: canonicalization of all SGroups data present :cvar TAUTOMER_HASH: SMILES-like representation for a generic tautomer form See SS-30145 for more documentation and example jupyter notebook """ CANONICAL_SMILES = enum.auto() ESCAPE = enum.auto() FORMULA = enum.auto() NO_STEREO_SMILES = enum.auto() NO_STEREO_TAUTOMER_HASH = enum.auto() SGROUP_DATA = enum.auto() TAUTOMER_HASH = enum.auto()
[docs]@enum.unique class HashScheme(enum.Enum): """ Which hash layers to use to when deduplicating molecules Typically the "ALL_LAYERS" scheme is used, but some users may want the "TAUTOMER_INSENSITIVE_LAYERS" scheme. :cvar ALL_LAYERS: most strict hash scheme utilizing all layers :cvar STEREO_INSENSITIVE_LAYERS: excludes stereo sensitive layers :cvar TAUTOMER_INSENSITIVE_LAYERS: excludes tautomer sensitive layers """ ALL_LAYERS = tuple(HashLayer) STEREO_INSENSITIVE_LAYERS = ( HashLayer.ESCAPE, HashLayer.FORMULA, HashLayer.NO_STEREO_SMILES, HashLayer.NO_STEREO_TAUTOMER_HASH, HashLayer.SGROUP_DATA, ) TAUTOMER_INSENSITIVE_LAYERS = ( HashLayer.ESCAPE, HashLayer.FORMULA, HashLayer.NO_STEREO_TAUTOMER_HASH, HashLayer.SGROUP_DATA, HashLayer.TAUTOMER_HASH, )
[docs]def get_molhash(all_layers, hash_scheme: HashScheme = HashScheme.ALL_LAYERS) -> str: """ Generate a molecular hash using a specified set of layers. :param mol: the molecule to generate the hash for :param hash_scheme: enum encoding information layers for the hash :return: hash for the given scheme constructed from the input layers """ h = hashlib.sha1() for layer in hash_scheme.value: h.update(all_layers[layer].encode()) return h.hexdigest()
[docs]def get_mol_layers(original_molecule: Chem.rdchem.Mol, data_field_names: Optional[Iterable] = None, escape: Optional[str] = None) -> set(HashLayer): """ Generate layers of data about that could be used to identify a molecule :param original_molecule: molecule to obtain canonicalization layers from :param data_field_names: optional sequence of names of SGroup DAT fields which will be included in the hash. :param escape: optional field which can contain arbitrary information :return: dictionary of HashLayer enum to calculated hash """ # Work on a copy with all non-stereogenic Hydrogens removed mol = _remove_unnecessary_hs(original_molecule, preserve_stereogenic_hs=True) strip_atom_map_labels(mol) formula = rdMolHash.MolHash(mol, rdMolHash.HashFunction.MolFormula) cxsmiles, canonical_mol = canonicalize_stereo_groups(mol) tautomer_hash = get_stereo_tautomer_hash(canonical_mol) canonical_smiles = get_canonical_smiles(cxsmiles) sgroup_data = canonicalize_sgroups(canonical_mol, dataFieldNames=data_field_names) no_stereo_tautomer_hash, no_stereo_smiles = get_no_stereo_layers( canonical_mol) return { HashLayer.CANONICAL_SMILES: canonical_smiles, HashLayer.ESCAPE: escape or "", HashLayer.FORMULA: formula, HashLayer.NO_STEREO_SMILES: no_stereo_smiles, HashLayer.NO_STEREO_TAUTOMER_HASH: no_stereo_tautomer_hash, HashLayer.SGROUP_DATA: sgroup_data, HashLayer.TAUTOMER_HASH: tautomer_hash, }
[docs]def strip_atom_map_labels(mol): for at in mol.GetAtoms(): if at.HasProp(ATOM_PROP_MAP_NUMBER): at.ClearProp(ATOM_PROP_MAP_NUMBER)
def _remove_unnecessary_hs(rdk_mol, preserve_stereogenic_hs=False): """ removes hydrogens that are not necessary for the registration hash, and preserves hydrogen isotopes """ remove_hs_params = Chem.RemoveHsParameters() remove_hs_params.updateExplicitCount = True remove_hs_params.removeDefiningBondStereo = not preserve_stereogenic_hs edited_mol = Chem.rdmolops.RemoveHs(rdk_mol, remove_hs_params, sanitize=False) edited_mol.UpdatePropertyCache(False) return edited_mol
[docs]def get_stereo_tautomer_hash(molecule): if molecule.GetNumAtoms() == 0: return EMPTY_MOL_TAUTOMER_HASH # SHARED-7909: workaround for https://github.com/rdkit/rdkit/issues/4234 # This can be removed once we update to an RDKit version without this bug. no_h_mol = _remove_unnecessary_hs(molecule) no_h_mol.UpdatePropertyCache(False) # setting useCxSmiles param value to always include enhanced stereo info useCxSmiles = True hash_with_cxExtensions = rdMolHash.MolHash( no_h_mol, rdMolHash.HashFunction.HetAtomTautomer, useCxSmiles) # since the cxSmiles can include anything, we want to only include # enhanced stereo information canonical_smiles = get_canonical_smiles(hash_with_cxExtensions) return canonical_smiles
[docs]def get_canonical_smiles(cxsmiles): smiles_parts = (p.strip() for p in cxsmiles.split('|')) smiles_parts = [p for p in smiles_parts if p] if not smiles_parts: return '', '' elif len(smiles_parts) > 2: raise ValueError('Unexpected number of fragments in canonical CXSMILES') canonical_smiles = smiles_parts[0] stereo_groups = '' if len(smiles_parts) == 2: # note: as with many regex-based things, this is fragile groups = ENHANCED_STEREO_GROUP_REGEX.findall(smiles_parts[1]) if groups: # We might have other CXSMILES extensions that aren't stereo groups stereo_groups = f"|{','.join(sorted(groups))}|" if stereo_groups: return canonical_smiles + " " + stereo_groups return canonical_smiles
[docs]def get_no_stereo_layers(mol): # SHARED-7909: Strip all Hs, including stereogenic ones. no_stereo_mol = _remove_unnecessary_hs(mol) no_stereo_mol.UpdatePropertyCache(False) Chem.rdmolops.RemoveStereochemistry(no_stereo_mol) no_stereo_tautomer_hash = rdMolHash.MolHash( no_stereo_mol, rdMolHash.HashFunction.HetAtomTautomer) no_stereo_smiles = rdMolHash.MolHash(no_stereo_mol, rdMolHash.HashFunction.CanonicalSmiles) return (no_stereo_tautomer_hash, no_stereo_smiles)
[docs]def get_canonical_atom_ranks_and_bonds(mol, useSmilesOrdering=True): """ returns a 2-tuple with: 1. the canonical ranks of a molecule's atoms 2. the bonds expressed as (canonical_atom_rank_1,canonical_atom_rank_2) where canonical_atom_rank_1 < canonical_atom_rank_2 If useSmilesOrdering is True then the atom indices here correspond to the order of the atoms in the canonical SMILES, otherwise just the canonical atom order is used. useSmilesOrdering=True is a bit slower, but it allows the output to be linked to the canonical SMILES, which can be useful. """ if mol.GetNumAtoms() == 0: return [], [] if not useSmilesOrdering: atRanks = list(Chem.CanonicalRankAtoms(mol)) else: smi = Chem.MolToSmiles(mol) ordertxt = mol.GetProp("_smilesAtomOutputOrder") smiOrder = [int(x) for x in ordertxt[1:-1].split(",") if x] atRanks = [0] * len(smiOrder) for i, idx in enumerate(smiOrder): atRanks[idx] = i # get the bonds in canonical form bndOrder = [] for bnd in mol.GetBonds(): bo = atRanks[bnd.GetBeginAtomIdx()] eo = atRanks[bnd.GetEndAtomIdx()] if bo > eo: bo, eo = eo, bo bndOrder.append((bo, eo)) return atRanks, bndOrder
[docs]def canonicalize_data_sgroup(sg, atRanks, bndOrder, fieldNames=None, sortAtomOrder=True): """ NOTES: if sortAtomOrder is true then the atom list will be sorted. This assumes that the order of the atoms in that list is not important """ if sg.GetProp("TYPE") != "DAT" or not sg.HasProp("FIELDNAME"): return None canonicalizableFieldNames = fieldNames or ["Atrop"] fieldName = sg.GetProp("FIELDNAME") if fieldName not in canonicalizableFieldNames: return None data = sg.GetStringVectProp("DATAFIELDS") if len(data) > 1: raise ValueError( "cannot canonicalize data groups with multiple data fields") data = data[0] ats = tuple(atRanks[x] for x in sg.GetAtoms()) if sortAtomOrder: ats = tuple(sorted(ats)) bnds = tuple(bndOrder[x] for x in sg.GetBonds()) res = dict(fieldName=fieldName, atom=ats, bonds=bnds, value=data) return res
[docs]def getCanonicalBondRep(bond, atomRanks): aid1 = bond.GetBeginAtomIdx() aid2 = bond.GetEndAtomIdx() if atomRanks[aid1] > atomRanks[aid2] or (atomRanks[aid1] == atomRanks[aid2] and aid1 > aid2): aid1, aid2 = aid2, aid1 return (aid1, aid2)
[docs]def canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder): """ NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted. This assumes that the ordering of those lists is not important """ if sg.GetProp("TYPE") != "SRU": return None ats = tuple(atRanks[x] for x in sg.GetAtoms()) bnds = tuple(bndOrder[x] for x in sg.GetBonds()) if sortAtomAndBondOrder: ats = tuple(sorted(ats)) bnds = tuple(sorted(bnds)) props = sg.GetPropsAsDict() res = dict( type="SRU", atoms=ats, bonds=bnds, index=props.get("index", 0), connect=props.get("CONNECT", ""), label=props.get("LABEL", ""), ) if "PARENT" in props: res["PARENT"] = props["PARENT"] if "XBHEAD" in props: xbhbonds = tuple( getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks) for x in props["XBHEAD"]) if sortAtomAndBondOrder: xbhbonds = tuple(sorted(xbhbonds)) res["XBHEAD"] = xbhbonds if "XBCORR" in props: xbcorrbonds = tuple( getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks) for x in props["XBCORR"]) if len(xbcorrbonds) % 2: raise ValueError("XBCORR should have 2N bonds") if sortAtomAndBondOrder: # these are pairs, so we need to sort them as such tmp = [] for i in range(0, len(xbcorrbonds), 2): b1, b2 = xbcorrbonds[i], xbcorrbonds[i + 1] if b1 > b2: b1, b2 = b2, b1 tmp.append((b1, b2)) xbcorrbonds = tuple(sorted(tmp)) res["XBCORR"] = xbcorrbonds return res
[docs]def canonicalize_cop_sgroup(sg, atRanks, sortAtomAndBondOrder): """ NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted. This assumes that the ordering of those lists is not important """ if sg.GetProp("TYPE") != "COP": return None ats = tuple(atRanks[x] for x in sg.GetAtoms()) if sortAtomAndBondOrder: ats = tuple(sorted(ats)) props = sg.GetPropsAsDict() # we are intentionally not doing the label here. Marvin adds one, biovia draw does not res = dict(type="COP", atoms=ats, index=props.get("index", 0)) return res
[docs]def canonicalize_sgroups(mol, dataFieldNames=None, sortAtomAndBondOrder=True): """ NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted. This assumes that the ordering of those lists is not important """ dataFieldNames = dataFieldNames or ["Atrop"] atRanks, bndOrder = get_canonical_atom_ranks_and_bonds(mol) res = [] for sg in Chem.GetMolSubstanceGroups(mol): lres = None if sg.GetProp("TYPE") == "DAT": lres = canonicalize_data_sgroup(sg, atRanks, bndOrder, dataFieldNames, sortAtomAndBondOrder) elif sg.GetProp("TYPE") == "SRU": lres = canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder) elif sg.GetProp("TYPE") == "COP": lres = canonicalize_cop_sgroup(sg, atRanks, sortAtomAndBondOrder) if lres is not None: res.append(lres) if len(res) > 1: # we need to sort them, but dicts don't let you do that directly: tres = sorted(tuple(x.items()) for x in res) res = tuple(dict(x) for x in tres) # update "index" and "PARENT" props idxmap = {} for i, itm in enumerate(res): if "index" in itm: idxmap[itm["index"]] = i + 1 itm["index"] = i + 1 for i, itm in enumerate(res): if "PARENT" in itm: itm["PARENT"] = idxmap[itm["PARENT"]] return json.dumps(res)
[docs]class EnhancedStereoUpdateMode(enum.Enum): ADD_WEIGHTS = enum.auto() REMOVE_WEIGHTS = enum.auto()
[docs]def update_enhanced_stereo_group_weights(mol, mode): if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS: factor = 1 elif mode == EnhancedStereoUpdateMode.REMOVE_WEIGHTS: factor = -1 else: raise ValueError('Invalid Enhanced Stereo weight update mode') isotopesModified = False stgs = mol.GetStereoGroups() for stg in stgs: stgt = stg.GetGroupType() weight = factor * ENHANCED_STEREO_GROUP_WEIGHTS[stgt] for at in stg.GetAtoms(): # Make sure the isotope is reasonable and is not present in more than # one stereo group, and we are not messing it up isotope = at.GetIsotope() if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS and isotope >= 100: raise ValueError( f'Atom {at.GetIdx()} is an unusual isotope ({isotope})') at.SetIsotope(isotope + weight) isotopesModified = True return mol, isotopesModified
[docs]def canonicalize_stereo_groups(mol): """ Returns canonical CXSmiles and the corresponding molecule with the stereo groups canonicalized. The RDKit canonicalization code does not currently take stereo groups into account. We work around that by using EnumerateStereoisomers() to generate all possible instances of the molecule's stereogroups and then lexically compare the CXSMILES of those. """ if not len(mol.GetStereoGroups()): return Chem.MolToCXSmiles(mol), mol mol = Chem.Mol(mol) # add ring info if not initialized yet Chem.FastFindRings(mol) # To solve the problem we add isotope tags to the atoms involved in stereo # groups. These allow the canonicalization to tell the difference between # AND and OR (and have the happy side effect of allowing the # presence/absence of a stereo group to affect the canonicalization) mol, isotopesModified = update_enhanced_stereo_group_weights( mol, EnhancedStereoUpdateMode.ADD_WEIGHTS) # We're going to be generating canonical SMILES here anyway, so skip the # "unique" option for the sake of efficiency opts = EnumerateStereoisomers.StereoEnumerationOptions() opts.onlyStereoGroups = True opts.unique = False res = None res_csmi = '' # before enumerating stereoisomers, we strip the bond directions to prevent # rdkit from getting confused by an atom having multiple bond directions. # See SHARED-8757 for bond in mol.GetBonds(): bond.SetBondDir(Chem.BondDir.NONE) for isomer in EnumerateStereoisomers.EnumerateStereoisomers(mol, opts): # We need to generate the canonical CXSMILES for the molecule with # the isotope tags. csmi = Chem.MolToCXSmiles(isomer) if res is None or csmi < res_csmi: res = isomer res_csmi = csmi res_csmi = EXTRA_ISOTOPE_REMOVAL_REGEX.sub(r'[\1@', res_csmi) if isotopesModified: res, _ = update_enhanced_stereo_group_weights( res, EnhancedStereoUpdateMode.REMOVE_WEIGHTS) return res_csmi, res