Source code for schrodinger.application.phase.packages.ligand_aligner

"""
This module contains the LigandAligner class, which uses a best practices
approach to flexibly align a set of ligand structures. The basic procedure
is as follows:

1. A Bemis-Murcko scaffold analysis is performed, and structures are clustered
   according to their largest scaffold, where scaffolds are represented using
   bond orders but not elemental types (i.e., fuzzy scaffolds).
2. Clusters are sorted by decreasing scaffold size, and the structures within
   each cluster are sorted by decreasing size to establish a canonical order
   for all subsequent work. The very first structure in the canonical list is
   the primary reference, and subsequent structures in the list tend to be
   aligned to reference structures that appear earlier in the list.
3. A scaffold-based distance matrix is computed, where the i, j entry is the
   scaffold number (0, 1, etc.) of the largest scaffold that's shared by
   structures i and j. Larger scaffolds have lower scaffold numbers, so the
   scaffold number can be used as a distance. When constructing the matrix,
   embedded matching is employed (e.g., imidazole matches benzimidazole) in
   order to increase the size of the largest shared scaffold.
4. A minimum spanning tree is constructed from the distance matrix, with
   structures being added to the tree using the canonical order described in
   step 2. Each edge of the tree holds a structure, the reference to which it
   is to be aligned, and the largest scaffold they share. The ordered edges of
   this tree provide a prescription for sequentially aligning each structure
   to a previously aligned reference with which it shares the largest possible
   scaffold.
5. The ordered edges of the minimum spanning tree are traversed, with the
   applicable alignment method (see AlignMethod below) being used to find the
   best superposition of a given structure to its reference. Once a structure
   is aligned to its reference, the aligned structure replaces the original
   structure, and it is made available as a reference for subsequent structures
   in the tree.
6. The structures for which core snapping was successful are post-processed
   in an attempt to superimpose sidechains that are common to two or more
   structures.
7. The aligned structures are returned in the order they were provided.

If a particular structure is designated as the reference to which all other
structures are to be aligned, steps 1-5 are appropriately modified or skipped.
Furthermore, if a SMARTS-based core is designated or MCS-based cores are
requested, Bemis-Murcko scaffold detection is disabled and the indicated core
is used in lieu of the largest shared Bemis-Murcko scaffold. In the case of
MCS, bond orders and elemental types must match, ring atoms may match only
other ring atoms, and complete rings must be matched.

If refinement is requested, additional conformers are generated for each
structure that was successfully snapped to its reference, with core atoms and
snapped sidechains being held fixed. The resulting conformers are used to
systematically increase the average in-place shape similarity over all pairs
of structures, whether snapped or not. Conformers are not generated for the
the primary reference, a user designated reference, or any structure that
was not snapped to its reference.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import copy
import itertools
from enum import Enum

import numpy
from rdkit import Chem
from rdkit.Chem import rdFMCS

from schrodinger import structure
from schrodinger.infra import canvas
from schrodinger.infra import fast3d
from schrodinger.infra import mm
from schrodinger.infra import phase
from schrodinger.infra.canvas import ChmMmctAdaptor
from schrodinger.rdkit import rdkit_adapter
from schrodinger.structutils import analyze
from schrodinger.utils import log

# Designation for using MCS-based cores:
MCS_CORES = "MCS"

# Conformational sampling method:
SAMPLING_METHODS = {
    phase.CONF_SAMPLE_COARSE_NAME: "basic",
    phase.CONF_SAMPLE_FINE_NAME: "default"
}

# Method used to align a particular structure to its reference:
AlignMethod = Enum("AlignMethod", "snap_core least_squares shape_based")
ALIGN_METHOD_DESCRIPTION = {
    AlignMethod.snap_core: "snapped core",
    AlignMethod.least_squares: "flexible least-squares core",
    AlignMethod.shape_based: "flexible shape-based"
}

# Note that the preferred method is to snap the core onto the reference and
# conformationally sample the region outside the core. If the core cannot be
# snapped because of a stereo failure, open ring failure, etc., the entire
# structure is conformationally sampled and each conformer is aligned using
# least-squares superposition of the core atoms. If the structure shares no
# scaffolds with its reference, fully-flexible shape-based alignment is done.
# The best alignment, regardless of method, is the superposition that yields
# the highest all-atom shape similarity with atomic overlaps differentiated by
# elemental type.

# Aligned structures contain the following properties:
INPUT_STRUCTURE_NUMBER_PROP = "i_phase_Input_Structure_Number"
OUTPUT_STRUCTURE_NUMBER_PROP = "i_phase_Structure_Number"
REFERENCE_STRUCTURE_PROP = "i_phase_Reference_Structure"
SHAPE_SIM_PROP = "r_phase_Similarity_to_Reference"
ALIGN_METHOD_PROP = "s_phase_Alignment_Method"
ALIGN_REASON_PROP = "s_phase_Alignment_Reason"
CORE_SMARTS_PROP = "s_phase_Core_SMARTS"

# Atom coordinate tolerance for detecting common snapped cores:
ATOM_COORD_TOL = 1.0e-4

# Tolerance for detecting distorted bonds:
DISTORTED_BOND_TOL = 0.5

# Default maximum number of conformers to generate:
DEFAULT_MAX_CONFS = 1000


[docs]class LigandAlignerOptions: """ Holds options that control alignment of structures. """
[docs] def __init__(self): # Whether to refine superpositons after the basic alignment procedure. self.refine = False # Whether to immediately fail with a RuntimeError if any structures # contain multiple fragments or no rings. If False, those structures # are quietly skipped. self.fail_on_bad = False # Whether to skip the post-processing step wherein chemically identical # sidechains are snapped onto one another in pairs of structures that # share a common snapped core. self.ignore_sidechains = False # Whether to align hydrogens on terminal rotatable heavy atoms (-CH3, # -NH2, -OH, etc.) if the heavy atoms are part of the core. If False, # those hydrogens will be aligned only if the conformational sampling # yields a torsion angle that superimposes them. self.align_terminal_hydrogens = True # Conformational sampling method. Empirical evidence suggests that # fast3d "basic" sampling, which corresponds to Phase coarse/rapid # sampling, tends to yield the most consistent superpositions. self.sampling_method = phase.CONF_SAMPLE_COARSE_NAME # Maximum number of conformers to generate for each input structure. # Must be > 0. self.max_confs = DEFAULT_MAX_CONFS # Whether to minimize conformers. self.minimize_confs = False # Whether to replace the primary reference structure with a sampled # fast3d conformer that yields the best shape-based superposition to # the input primary reference structure. This can sometimes result in # better consensus alignments since the primary reference structure # is then an actual fast3d conformer, and as such may produce better # superpositions with fast3d conformers generated for other structures. self.use_sampled_ref = False # Tolerance for detection of close contacts in snapped structures. A # tolerance of 0 turns off close contact detection. self.close_contact_tol = phase.DEFAULT_CLOSE_CONTACT_TOL
[docs]class LigandAligner: """ Generates a consensus alignment for a set of ligands using the best practices approach described at the top of this module. """
[docs] def __init__(self, options=None, logger=None): """ Constructor taking options that control the alignment behavior and a logger for printed output. :param options: Alignment options. See also setOptions. :type options: LigandAlignerOptions :param logger: Logger for printed output. Informative messages about the alignment process are issued at the INFO level; additional messages triggered by core snapping failures are issed at the DEBUG level. See also setLogger. :type logger: Logger """ self.setOptions(options or LigandAlignerOptions()) self.setLogger(logger or log.get_output_logger(__file__))
[docs] def align(self, sts, ref_index=None, core=""): """ Aligns the supplied structures using core snapping, flexible least-squares core alignment, or flexible shape-based superposition. See setOptions to modify the default alignment behavior. :param sts: The structures to be aligned. :type sts: list(structure.Structure) :param ref_index: 1-based reference structure number. Overrides the basic algorithm that chooses reference structures and determines the order in which alignments are performed. :type ref_index: int :param core: If a reference structure is designated, core may be a SMARTS string that matches the reference structure, or it may be MCS_CORES. :type core: str :return: The aligned structures in the order they were provided, minus any fragmented structures or structures lacking rings. :rtype: list(structure.Structure) """ if core and not ref_index: mesg = "Must designate a reference structure when specifying core" raise RuntimeError(mesg) # Initialize all data that will be created during the alignment # workflow. self._initAlignData(ref_index, core) # Filter out bad structures, create ChmMol objects and, if # applicable, rdkit.Mol objects for MCS. sts_filtered, self._mols = self._setupAlign(sts) if len(sts_filtered) < 2: raise RuntimeError("Must supply 2 or more valid structures") self._checkCore() # Align the valid structures. return self._align(sts_filtered)
[docs] def getLogger(self): """ Returns the logger used for output messages. :return: The logger. :rtype: Logger """ return self._logger
[docs] def getOptions(self): """ Returns a copy of the options that control the alignment process and the level of printed output. :return: The options. :rtype: LigandAlignerOptions """ return copy.copy(self._options)
[docs] def setLogger(self, logger): """ Sets the logger to be used for printing messages during the alignment process. :param logger: The logger. :type logger: Logger """ self._logger = logger
[docs] def setOptions(self, options): """ Sets options that control the alignment process and the level of output printed. :param options: The options. :type options: LigandAlignerOptions """ if options.max_confs < 1: raise ValueError("Maximum number of conformers must be > 0") mode = SAMPLING_METHODS[options.sampling_method] use_default_frag_lib = True custom_frag = [] self._f3d = fast3d.Engine(mode, options.minimize_confs, use_default_frag_lib, custom_frag) self._f3d.setDesiredNumberOfConformers(options.max_confs) self._options = copy.copy(options)
def _align(self, sts): """ Aligns the supplied structures using core snapping, flexible least-squares core alignment, or flexible shape-based superposition. Structures are modified directly. :param sts: The structures to be aligned. :type sts: list(structure.Structure) :return: The aligned structures in the order they were provided. :rtype: list(structure.Structure) """ if not self._core: self._logger.info(f"\nFinding scaffolds for {len(sts)} structures") finder = canvas.ChmScaffoldFinder(self._mols, False, False) self._scaffold_count = finder.getScaffoldCount() mesg = f"Total number of scaffolds = {self._scaffold_count}" self._logger.info(mesg) # Store fuzzy SMARTS for each scaffold. for i in range(self._scaffold_count): smarts = finder.getScaffold(i).getFuzzyScaffold(2) # As per PHASE-2111, allow any aromatic nitrogen to match any # other aromatic nitrogen to protect against inconsistent # assignment of tautomeric states. self._fuzzy_smarts.append(smarts.replace("[aH]", "[a]")) elif self._core == MCS_CORES: self._logger.info(f"\nFinding MCS cores for {len(sts)} structures") self._findMcsCores(sts) else: self._fuzzy_smarts = [self._core] if not self._ref_index: # Create minimum spanning tree whose edges determine the order in # which structures are aligned. The following prerequisite work is # also done: # 1. Clustering structures by largest scaffold. # 2. Generation of a canonical structure order. # 3. Construction of the scaffold-based distance matrix. self._createMinimumSpanningTree() else: # Create a tree that traverses the input structures in the order # provided, with the user-designated reference being first. self._createLinearSpanningTree() # Prepare primary reference structure. prim_ref = self._mst_edges[0][0] sts[prim_ref] = self._preparePrimaryRef(sts[prim_ref]) # Perform alignments in the order prescribed by the edges of the # minimum/linear spanning tree. self._logger.info("") n = len(self._mst_edges) for i, edge in enumerate(self._mst_edges): stA = sts[edge[0]] stB = sts[edge[1]] smarts = "" dAB = edge[2] # Index of largest shared scaffold or MCS if dAB != -1: smarts = self._fuzzy_smarts[dAB] progress = f"({i + 1} of {n})" mesg = f"Aligning {edge[1] + 1}-->{edge[0] + 1} {progress}" if smarts: mesg += f": Core SMARTS = {smarts}" self._logger.info(mesg) stB.property[REFERENCE_STRUCTURE_PROP] = edge[0] + 1 stB.property[CORE_SMARTS_PROP] = smarts stB_aligned = self._getBestAlignment(stA, stB, smarts) shape_sim = stB_aligned.property[SHAPE_SIM_PROP] self._logger.info(" Shape similarity = %.6f" % shape_sim) sts[edge[1]] = stB_aligned if not self._options.ignore_sidechains: sts = self._snapSideChains(sts) if self._options.refine: sts = self._refineAlignments(sts) return sts def _alignConformers(self, stA, stB, stB_confs, shape, core_mapping=None, append_confs=True): """ Finds the best superposition of the provided conformers of stB onto stA using shape similarity. If core_mapping is provided, the B->A mappings therein will be used to perform a least-squares alignment) of the applicable core atoms. Otherwise, conformers will be aligned using the standard PhpFastShape algorithm. Returns the highest shape similarity found and the associated aligned conformer of stB. :param stA: The fixed reference structure. :type stA: structure.Structure :param stB: The structure to be aligned onto stA. :type stB: structure.Structure :param stB_confs: fast3d conformers of stB. Each conformer is a tuple whose second element contains the atomic coordinates as a linear array of length 3 * num_atoms. :type stB_confs: list(fast3d.Conformation) :param shape: PhpFastShape object with stA as the reference. :type shape: phase.PhpFastShape :param core_mapping: Pairs of 1-based B->A core atom mappings. :type core_mapping: list((int, int)) :param append_confs: If True, stB and stB_confs are screened. Otherwise, only stB_confs are screened. :type append_confs: bool :return: Highest shape similarity and associated aligned structure. :rtype: float, structure.Structure """ mappingA = None mappingB = None inplace = False if core_mapping: mappingB, mappingA = zip(*core_mapping) inplace = True result_best = [0.0] stB_copy = stB.copy() stB_best = None first_conf = True # Prepending [None] to stB_confs allows incoming stB coordinates to # be used. for stB_conf in [None] + stB_confs: if stB_conf: # Note that the shape of stB_conf[1] prevents it from being # supplied directly to stB_copy.setXYZ. mm.mmct_ct_set_all_xyz(stB_copy, stB_conf[1]) elif not append_confs: continue if core_mapping: phase.align_least_squares(stA, mappingA, stB_copy, mappingB, False) if self._options.align_terminal_hydrogens: phase.align_terminal_hydrogens(stA, stB_copy, core_mapping) result = shape.computeShapeSimPy(stB_copy, first_conf, inplace) first_conf = False if result[0] > result_best[0]: result_best = list(result) stB_best = stB_copy.copy() if not inplace: shape.alignCtPy(stB_best.handle, result_best) stB_best.property[SHAPE_SIM_PROP] = result_best[0] return result_best[0], stB_best def _alignLeastSquares(self, stA, stB, core_mappings): """ Generates conformers for stB, with stB->stA least-squares alignment on the core atoms, and finds the conformer that yields the highest shape similarity to the reference structure stA. :param stA: The fixed reference structure. :type stA: structure.Structure :param stB: The structure to be aligned onto stA. :type stB: structure.Structure :param core_mappings: Lists of 1-based B-->A atom mappings that define the various ways the core of stB is to be aligned to stA. :type core_mappings: list(list((int, int))) :return: Best alignment over all generated conformers of stB. :rtype: structure.Structure """ shape = self._getShape(stA) stB_confs = self._f3d.run(stB) sim_best = 0.0 stB_aligned_best = None core_mapping_best = None for core_mapping in core_mappings: sim, stB_aligned = self._alignConformers(stA, stB, stB_confs, shape, core_mapping) if sim > sim_best: sim_best = sim stB_aligned_best = stB_aligned core_mapping_best = list(core_mapping) st_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 self._least_squares_core_mappings[st_number] = core_mapping_best return stB_aligned_best def _alignShapes(self, stA, stB, append_confs=True): """ Generates conformers for stB and finds the conformer that yields the best shape-based alignment to the reference structure stA. :param stA: The fixed reference structure. :type stA: structure.Structure :param stB: The structure to be aligned onto stA. :type stB: structure.Structure :param append_confs: If True, stB and the generated conformers are screened. Otherwise, only the conformers are screened. :type append_confs: bool :return: Best alignment over all generated conformers of stB. :rtype: structure.Structure """ shape = self._getShape(stA) return self._alignConformers(stA, stB, self._f3d.run(stB), shape, append_confs=append_confs)[1] def _alignSnappedCore(self, stA, snappedB, core_mappings): """ Generates conformers for each snapped structure in snappedB, keeping the core atoms frozen, and finds the conformer that yields the highest shape similarity to the reference structure stA. If refinement is turned on, this function also fills self._frozen_refinement_atoms with a first approximation of the atoms to hold fixed during refinement. The sets are typically expanded as rigid regions are grown when snapping sidechains. :param stA: The fixed reference structure. :type stA: structure.Structure :param snappedB: The snapped structures to be aligned onto stA. :type snappedB: list(structure.Structure) :param core_mappings: 1-based B->A core atom mappings for each structure in snappedB. :type core_mappings: list(list((int, int))) :return: Best alignment over all generated conformers of snappedB. :rtype: structure.Structure """ shape = self._getShape(stA) frozen_prop = fast3d.Engine.FROZEN_ATOMS_PROPERTY_NAME # Note: The presence of frozen_prop signals that the core has been # snapped and should be held frozen. The core still needs to be # realigned, however, because fast3d doesn't keep the conformers # in the original frame of reference. sim_best = 0.0 stB_aligned_best = None core_mapping_best = None for stB, core_mapping in zip(snappedB, core_mappings): frozen_atoms = self._getFrozenAtoms(stB, core_mapping) frozen_atoms_string = " ".join(str(i) for i in frozen_atoms) stB_copy = stB.copy() stB_copy.property[frozen_prop] = frozen_atoms_string stB_confs = self._f3d.run(stB_copy) if frozen_prop in stB_copy.property: del stB_copy.property[frozen_prop] sim, stB_aligned = self._alignConformers(stA, stB_copy, stB_confs, shape, core_mapping) if sim > sim_best: sim_best = sim stB_aligned_best = stB_aligned core_mapping_best = list(core_mapping) st_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 if not self._options.ignore_sidechains: # Store best mapping for use in common core construction when # snapping identical sidechains. self._snapped_core_mappings[st_number] = core_mapping_best if self._options.refine: self._frozen_refinement_atoms[st_number] = set( self._getFrozenAtoms(stB, core_mapping_best)) return stB_aligned_best def _badCore(self, st1, st2, core1, core2): """ Returns True if core1 is too small or either core is disconnected. :param st1: First structure. :type st1: structure.Structure :param st2: Second structure. :type st2: structure.Structure :param core1: 1-based core atom numbers in first structure. :type core1: list(int) :param core2: 1-based core atom numbers in second structure. :type core2: list(int) :return: True if core is bad. :rtype: bool """ if len(core1) < 3: return True if not phase.is_connected_subgraph(st1.handle, core1): return True if not phase.is_connected_subgraph(st2.handle, core2): return True return False def _checkCore(self): """ Raises a RuntimeError if a user-supplied core SMARTS doesn't match the user-designated reference structure. """ smarts = self._core if smarts and smarts != MCS_CORES: mol = self._mols[self._ref_index - 1] query = canvas.ChmQuery(smarts) matcher = canvas.ChmQueryMatcher(True) if not matcher.hasMatch(query, mol): mesg = f'SMARTS "{smarts}" does not match reference structure' raise RuntimeError(mesg) def _clusterByScaffold(self): """ Clusters structures according to their largest scaffold. """ self._logger.info("\nClustering structures by largest scaffold") self._storeFuzzyScaffoldsInMol() already_clustered = set() nmol = len(self._mols) for iscaff in range(self._scaffold_count): source_structures = [] for jmol in range(nmol): if jmol not in already_clustered: if iscaff in self._scaffolds_in_mol[jmol]: source_structures.append(jmol) already_clustered.add(jmol) if source_structures: self._clusters[iscaff] = source_structures icluster = 0 for iscaff, source_structures in self._clusters.items(): icluster += 1 smarts = self._fuzzy_smarts[iscaff] ss_string = ", ".join(str(i + 1) for i in source_structures) self._logger.info(f"Cluster {icluster}:") self._logger.info(f" Scaffold SMARTS = {smarts}") self._logger.info(f" Structures: {ss_string}") def _createLinearSpanningTree(self): """ Creates a surrogate for the minimum spanning tree when a reference structure has been designated. That structure will appear in every edge of the tree, and the order of the edges will follow the input order of the other ligands. """ nmol = len(self._mols) ref_pos = self._ref_index - 1 self._canonical_order = [ref_pos] for i in range(nmol): if i != ref_pos: self._canonical_order.append(i) if self._core: if self._core == MCS_CORES: dmatrix = self._getMcsMatrix() self._scaffold_count = nmol else: dmatrix = self._getSmartsMatrix() self._scaffold_count = 1 else: self._storeFuzzyScaffoldsInMol() dmatrix = self._getScaffoldMatrix() for i in range(1, nmol): ipos = self._canonical_order[i] dij = dmatrix[ipos][ref_pos] if dij == self._scaffold_count: # No shared scaffold/SMARTS. dij = -1 self._mst_edges.append([ref_pos, ipos, dij]) def _createMinimumSpanningTree(self): """ Creates a minimum spanning tree using Prim's algorithm. The tree consists of edges [i, j, dij], where i and j are 0-based structure numbers, and dij is the 0-based scaffold number of the largest scaffold shared by the two structures. If they share no scaffolds, dij will be set to -1. """ # Cluster compounds by the largest scaffold they contain. self._clusterByScaffold() # Sort structures by cluster, and then by decreasing structure # size within each cluster. Clusters are already ordered by # decreasing size of the associated scaffold. size_ranks1 = [-mol.getHeavyAtomCount() for mol in self._mols] size_ranks2 = [-mol.getAtomCount() for mol in self._mols] size_ranks = list(zip(size_ranks1, size_ranks2)) ranks = [] for iscaff, source_structures in self._clusters.items(): for imol in source_structures: ranks.append((iscaff, size_ranks[imol], imol)) self._canonical_order = [row[2] for row in sorted(ranks)] dmatrix = self._getScaffoldMatrix() nmol = len(self._mols) in_mst = nmol * [0] # Start with primary reference structure and build edges of mst by # iteratively adding the closest structure not already in the mst. primary_ref = self._canonical_order[0] in_mst[primary_ref] = 1 mst_size = 1 big_dist = self._scaffold_count + 1 while mst_size < nmol: dmin = big_dist imin = None jmin = None # Always follow canonical order when adding edges so that larger # structures with larger scaffolds tend to appear earlier in tree. for i in self._canonical_order: if in_mst[i]: for j in self._canonical_order: if not in_mst[j] and dmatrix[i][j] < dmin: dmin = dmatrix[i][j] imin = i jmin = j in_mst[jmin] = 1 mst_size += 1 self._mst_edges.append([imin, jmin, dmin]) # Correct edges that correspond to no shared scaffold. These are the # edges for which flexible shape-based alignment must be done. for edge in self._mst_edges: if edge[2] == self._scaffold_count: edge[0] = primary_ref edge[2] = -1 def _failsRDKitConversion(self, st): """ Returns a non-blank error message if the supplied structure cannot be converted to an RDKit Mol. Returns an empty string otherwise. :param st: The structure to be checked :type st: structure.Structure :return: Error message or empty string :rtype: str """ try: rdkit_adapter.to_rdkit(st, implicitH=True) except Exception as e: return str(e) return '' def _filterMatches(self, matches, filter): """ Returns the rows in matches that contain the exact same atom numbers as in filter. :param matches: Lists of 1-based atom numbers. :type matches: list(list(int)) :param filter: List of 1-based atom numbers. :type filter: list(int) :return: Matches containing the same atom numbers as in filter. :rtype: list(list(int)) """ filter_set = set(filter) n = len(filter_set) filtered_matches = [] for match in matches: if len(filter_set.intersection(match)) == n: filtered_matches.append(match) return filtered_matches def _findMcsCores(self, sts): """ Finds the MCS between the reference structure and each of the supplied structures and stores the smarts in self._fuzzy_smarts, in the same order as sts. An empty string is stored for the reference structure as well as for any structure that fails to yield an MCS of at least 3 atoms. :param sts: The structures to be aligned. :type sts: list(structure.Structure) """ ref_pos = self._ref_index - 1 rdkit_mol_ref = rdkit_adapter.to_rdkit(sts[ref_pos], implicitH=True) chm_adaptor = canvas.ChmMmctAdaptor() stereo = canvas.ChmMmctAdaptor.StereoFromAnnotation_Safe chm_mol_ref = chm_adaptor.create(sts[ref_pos], stereo) chm_matcher = canvas.ChmQueryMatcher(True) for i, st in enumerate(sts): if i == ref_pos: self._fuzzy_smarts.append("") else: rdkit_mol = rdkit_adapter.to_rdkit(st, implicitH=True) smarts = rdFMCS.FindMCS( [rdkit_mol, rdkit_mol_ref], ringMatchesRingOnly=True, completeRingsOnly=True, atomCompare=Chem.rdFMCS.AtomCompare.CompareElements, bondCompare=Chem.rdFMCS.BondCompare.CompareOrder, ringCompare=Chem.rdFMCS.RingCompare.StrictRingFusion, timeout=10).smartsString # Ensure that the RDKit SMARTS is legal and matches at least # 3 atoms in the equivalent ChmMol, so that we know it's safe # to use the SMARTS for subsequent Canvas-based work. chm_mol = chm_adaptor.create(st, stereo) match_size = 0 try: chm_query = canvas.ChmQuery(smarts) for match in chm_matcher.match(chm_query, chm_mol): match_size = match.getMatchedAtoms().size() break except: pass if match_size < 3: smarts = "" self._fuzzy_smarts.append(smarts) def _getAlignCoreOptions(self, snapping_sidechains=False): """ Creates options for snapping core atoms. By default, close contacts, open rings, and changes in stereochemistry are prohibited, but a PhpException will not be thrown in the event of these failures, so that mappings can be retrieved for least-squares alignment of the core. If snapping_sidechains is True, open rings will be allowed, and the other types of failures will trigger an exception. Open rings are allowed when snapping sidechains because only the sidechain atoms will move, so it doesn't matter if a given ring is not fully mapped. :param snapping_sidechains: Whether sidechains are being snapped. :type snapping_sidechains: bool :return: Core snapping options. :rtype: phase.PhpAlignCoreOptions """ options = phase.PhpAlignCoreOptions() options.close_contact_tol = self._options.close_contact_tol if not snapping_sidechains: options.open_ring_policy = phase.PhpOpenRingPolicy_SKIP options.stereo_change_action = phase.PhpStereoChangeAction_SAVE_MAPPINGS options.close_contact_action = phase.PhpCloseContactAction_SAVE_MAPPINGS options.open_ring_action = phase.PhpOpenRingAction_SAVE_MAPPINGS return options def _getAnchorAtoms(self, st, ring_atoms): """ Returns a set containing 1-based atom numbers for all atoms from which a sidechain is allowed to be grown. This consists of ring atoms bonded to exactly one exocyclic heavy atom. :param st: Structure whose anchor atoms are sought. :type st: structure.Structure :param ring_atoms: 1-based set of all ring atom numbers in structure. :type ring_atoms: set(int) :return: Anchor atom numbers. :rtype: set(int) """ anchor_atoms = set() for ring_atom in ring_atoms: num_exocyclic = 0 for neighbor in st.atom[ring_atom].bonded_atoms: if neighbor.atomic_number != 1 and neighbor.index not in ring_atoms: num_exocyclic += 1 if num_exocyclic == 1: anchor_atoms.add(ring_atom) return anchor_atoms def _getBestAlignment(self, stA, stB, smarts=""): """ Finds the best flexible alignment of stB onto stB using one of the following treatments of stB: (1) Core snapping with flexible treatment beyond core. (2) Full flexible treatment with least-squares alignment of core. (3) Full flexible treatment with elemental shape-based alignment. Method 1 is used if a SMARTS is provided and the core can be snapped with no errors. Method 2 is used if core snapping results in errors. Method 3 is used if no SMARTS is provided. :param stA: The fixed reference structure :type stA: structure.Structure :param stB: The structure to be aligned onto stA :type stB: structure.Structure :param smarts: SMARTS string that represents the core :type smarts: str :return: Aligned version of stB. :rtype: structure.Structure """ align_method = AlignMethod.snap_core align_method_descr = ALIGN_METHOD_DESCRIPTION[align_method] align_reason = "" snappedB = [] core_mappings = [] if smarts: align_core_options = self._getAlignCoreOptions() try: aligner = phase.PhpAlignCore(stA.handle, stB.handle, smarts, align_core_options) # We've asked that mappings be saved even if snapping fails. core_mappings = aligner.getMappings() # Check for failures. if aligner.closeContactFailure(): align_reason = "creation of a close contact" elif aligner.openRingFailure(): align_reason = "open ring in core mapping" elif aligner.stereoFailure(): align_reason = "stereochemistry change" if align_reason: align_method = AlignMethod.least_squares else: # Get snapped core alignment for each mapping. snappedB = [ structure.Structure(ct) for ct in aligner.getAlignedB() ] # Keep only alignments without distorted bonds. align_keep = self._removeDistoredStructures(stB, snappedB) if align_keep: snappedB = [snappedB[i] for i in align_keep] core_mappings = [core_mappings[i] for i in align_keep] else: align_method = AlignMethod.least_squares align_reason = "distorted bond in snapped structure" except phase.PhpException as exc: # An unknown type of failure. We don't have mappings, so we # have to do flexible shape-based alignment. self._logger.debug(f"Core snapping failed: {exc}") align_method = AlignMethod.shape_based align_reason = "unknown core snapping failure" else: # No core SMARTS available for this pair. align_method = AlignMethod.shape_based if not self._core: align_reason = "no shared scaffolds" elif self._core == MCS_CORES: align_reason = "no MCS with reference" else: align_reason = "no SMARTS matches" if align_reason: align_method_descr = ALIGN_METHOD_DESCRIPTION[align_method] mesg = "Performing {} alignment due to {}" self._logger.info(mesg.format(align_method_descr, align_reason)) stB_aligned = None if align_method == AlignMethod.snap_core: stB_aligned = self._alignSnappedCore(stA, snappedB, core_mappings) elif align_method == AlignMethod.least_squares: stB_aligned = self._alignLeastSquares(stA, stB, core_mappings) else: stB_aligned = self._alignShapes(stA, stB) stB_aligned.property[ALIGN_METHOD_PROP] = align_method_descr stB_aligned.property[ALIGN_REASON_PROP] = align_reason return stB_aligned def _getFrozenAtoms(self, stB, core_mapping): """ Creates a 1-based list of atoms in stB to be held frozen while generating conformers. :param stB: Structure containing atoms to freeze. :type stB: structure.Structure :param core_mapping: Pairs of 1-based B->A core atom mappings. :type core_mapping: list(int, int)) :return: List of frozen atoms. :rtype: list(int) """ frozen_atoms = [atom_pair[0] for atom_pair in core_mapping] if not self._options.minimize_confs: # Expand list to include terminal atoms attached to frozen_atoms # because they may end up with weird placement otherwise. frozen_set = set(frozen_atoms) n = len(frozen_atoms) for i in range(n): for neighbor in stB.atom[frozen_atoms[i]].bonded_atoms: if neighbor.bond_total == 1 and neighbor not in frozen_set: frozen_atoms.append(neighbor.index) return frozen_atoms def _getMcsMatrix(self): """ Surrogate for _getScaffoldMatrix when MCS-based cores are in use. The [i][ref] and [ref][i] elements are set to i if structure i yielded an MCS with the reference structure. Other elements are set to -1. :return: n by n matrix with values as described above. :rtype: numpy.ndarray """ nmol = len(self._mols) mcs_matrix = numpy.full((nmol, nmol), -1) ref_pos = self._ref_index - 1 for i in range(nmol): if self._fuzzy_smarts[i]: mcs_matrix[i][ref_pos] = i mcs_matrix[ref_pos][i] = i return mcs_matrix def _getScaffoldMatrix(self): """ Determines the largest scaffold shared by each pair of structures. This can be thought of as a distance matrix because a value of 0 means that the two structures share the largest identified scaffold and thus share as much of their scaffold backbone as possible. Larger scaffold numbers correspond to progressively smaller shared scaffolds and hence larger distances. A distance of self._scaffold_count indicates the structures share no scaffolds. :return: n by n matrix with scaffold numbers in off-diagonal positions, where n is the number of structures. :rtype: numpy.ndarray """ nmol = len(self._mols) nscaff = self._scaffold_count scaffold_matrix = numpy.full((nmol, nmol), nscaff) for i, scaffolds_i in enumerate(self._scaffolds_in_mol): if scaffolds_i: scaffold_matrix[i][i] = min(scaffolds_i) for j in range(i): scaffolds_j = self._scaffolds_in_mol[j] shared_ij = scaffolds_i.intersection(scaffolds_j) if shared_ij: dij = min(shared_ij) scaffold_matrix[i][j] = dij scaffold_matrix[j][i] = dij return scaffold_matrix def _getSmartsMatrix(self): """ Surrogate for _getScaffoldMatrix when a core SMARTS has been supplied. Off-diagonal elements are set to 0 if one of the structures is the reference and the other structure matches the core SMARTS. Other elements are set to 1. :return: n by n matrix with 0/1 values as described above, where n is the number of structures. :rtype: numpy.ndarray """ nmol = len(self._mols) smarts_matrix = numpy.ones((nmol, nmol), int) query = canvas.ChmQuery(self._core) matcher = canvas.ChmQueryMatcher(True) ref_pos = self._ref_index - 1 for i, mol in enumerate(self._mols): if i != ref_pos: if matcher.hasMatch(query, mol): smarts_matrix[i][ref_pos] = 0 smarts_matrix[ref_pos][i] = 0 return smarts_matrix def _getShape(self, st_ref): """ Creates a PhpFastShape object to score alignments against the provided reference structure. :param st_ref: Reference structure. :type st_ref: structure.Structure :return: PhpFastShape object. :rtype: phase.PhpFastShape """ atom_types = "element" hydrogens = phase.HYDROGENS_ALL weight_prop = "" return phase.PhpFastShape(st_ref.handle, hydrogens, atom_types, weight_prop) def _getSideChain(self, anchor_atom, st, ring_atoms): """ Retrieves a sidechain connected to the indicated anchor atom from a stored dictionary or grows it from scratch. The returned list of sidechain atoms will be [-1] if a chain cannot be grown or if the chain has been removed from the pool because it was already snapped. :param anchor_atom: 1-based atom number from which the sidechain originates. This atom is not included in the sidechain. :type anchor_atom: int :param st: The structure. :type st: structure.Structure :param ring_atoms: 1-based set of ring atom numbers in the structure. :type ring_atoms: set(int) :return: 1-based sidechain atom numbers followed by SMARTS. :rtype: list(int), str """ st_number = st.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 atoms = self._sidechain_atoms[st_number][anchor_atom] smarts = self._sidechain_smarts[st_number][anchor_atom] if not atoms: atoms, smarts = self._growSideChain(st, anchor_atom, ring_atoms) if not atoms: atoms = [-1] self._sidechain_atoms[st_number][anchor_atom] = atoms self._sidechain_smarts[st_number][anchor_atom] = smarts return atoms, smarts def _growCore(self, st_i, st_j, core_i, core_j): """ Attempts to expand the provided lists of core atoms by adding pairs of heavy atoms which have the same coordinates and which are bonded to atoms already in the core. The lists of core atoms are modified directly. :param st_i: Structure associated with core_i. :type st_i: structure.Structure :param st_j: Structure associated with core_j. :type st_j: structure.Structure :param core_i: 1-based list of core atom numbers in st_i. :type core_i: list(int) :param core_j: 1-based list of core atom numbers in st_j. :type core_j: list(int) """ excluded_i = (st_i.atom_total + 1) * [0] for i in core_i: excluded_i[i] = 1 excluded_j = (st_j.atom_total + 1) * [0] for j in core_j: excluded_j[j] = 1 while True: # Look for atoms with the same coordinates bonded to current core. len_prev = len(core_i) for atom_pos in range(len_prev): atom_i = core_i[atom_pos] atom_j = core_j[atom_pos] for bi in st_i.atom[atom_i].bonded_atoms: if excluded_i[bi.index] or bi.atomic_number == 1: continue xyzi = (bi.x, bi.y, bi.z) for bj in st_j.atom[atom_j].bonded_atoms: if excluded_j[bj.index] or bj.atomic_number == 1: continue xyzj = (bj.x, bj.y, bj.z) match = True for coord_i, coord_j in zip(xyzi, xyzj): if abs(coord_i - coord_j) > ATOM_COORD_TOL: match = False break if match: core_i.append(bi.index) core_j.append(bj.index) excluded_i[bi.index] = 1 excluded_j[bj.index] = 1 if len(core_i) == len_prev: break def _growSideChain(self, st, anchor_atom, ring_atoms): """ Attempts to grow a sidechain attached to anchor_atom, which is assumed to be a ring atom in the core with exactly one attached exocyclic heavy atom. Returns the list of heavy atoms in the sidechain followed by the the corresponding SMARTS string. An empty list and string are returned if the chain leads to another ring. :param st: Structure for which the chain is to be grown. :type st: structure.Structure :param anchor_atom: 1-based atom number of chain attachment point. :type anchor_atom: int :param ring_atoms: 1-based set of all ring atom numbers in structure. :type ring_atoms: set(int) :return: 1-based atom numbers and SMARTS string for sidechain. :rtype: list(int), str """ excluded = (st.atom_total + 1) * [0] excluded[anchor_atom] = 1 sidechain_atoms = [] # Find first atom in sidechain. for neighbor in st.atom[anchor_atom].bonded_atoms: index = neighbor.index if index not in ring_atoms and neighbor.atomic_number != 1: sidechain_atoms.append(neighbor) excluded[index] = 1 # Grow sidechain from first atom, traversing all branches. while True: len_prev = len(sidechain_atoms) for i in range(len_prev): atom = sidechain_atoms[i] for neighbor in atom.bonded_atoms: index = neighbor.index if not excluded[index]: if index in ring_atoms: return [], "" elif neighbor.atomic_number != 1: sidechain_atoms.append(neighbor) excluded[index] = 1 if len(sidechain_atoms) == len_prev: break smarts = analyze.generate_smarts_canvas(st, sidechain_atoms) return [atom.index for atom in sidechain_atoms], smarts def _initAlignData(self, ref_index, core): """ Initializes all member level data that gets regenerated with each call to the align() function. :param ref_index: 1-based reference structure number. :type ref_index: int :param core: A SMARTS string, MCS_CORES or empty. :type core: str """ self._ref_index = ref_index self._core = core self._anchor_atom_sets = [] self._clusters = {} self._fuzzy_smarts = [] self._mols = [] self._mst_edges = [] self._ring_atom_sets = [] self._scaffold_count = 0 self._scaffolds_in_mol = [] self._sidechain_atoms = [] self._sidechain_smarts = [] self._snapped_core_mappings = {} self._least_squares_core_mappings = {} self._ring_atom_sets = [] self._frozen_refinement_atoms = {} def _initSideChains(self): """ Initializes dictionaries of sidechain atoms and SMARTS that are used to avoid repetitive re-growing of sidechains from the same anchor atoms. """ # Loop over anchor atoms in each structure. for anchor_atoms in self._anchor_atom_sets: sidechain_atoms = {} sidechain_smarts = {} for atom in anchor_atoms: sidechain_atoms[atom] = [] sidechain_smarts[atom] = "" self._sidechain_atoms.append(sidechain_atoms) self._sidechain_smarts.append(sidechain_smarts) def _matchCoreAtoms(self, st_i, st_j): """ Identifies the common core for a pair of structures from coordinates of snapped atoms. :param st_i: First structure in pair. :type st_i: structure.Structure :param st_j Second structure in pair. :type st_j: structure.Structure :return: Lists of 1-based core atom numbers for structures i and j. :rtype: list(int), list(int) """ i = st_i.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 j = st_j.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 core_atoms_i = [] core_atoms_j = [] if self._core_atom_coords[i] and self._core_atom_coords[j]: for row_i in self._core_atom_coords[i]: for row_j in self._core_atom_coords[j]: match = True for k in range(1, 4): if abs(row_i[k] - row_j[k]) > ATOM_COORD_TOL: match = False break if match: core_atoms_i.append(row_i[0]) core_atoms_j.append(row_j[0]) if core_atoms_i: # Try to add coincident atoms that lie outside the defined # cores. self._growCore(st_i, st_j, core_atoms_i, core_atoms_j) return core_atoms_i, core_atoms_j def _matchSideChains(self, stA, stB, anchorA, anchorB): """ Returns all B-->A atom mappings for the sidechains attached to the indicated anchor atoms. No mappings are returned if the side chains don't exist or they are not equivalent. :param stA: The fixed reference structure. :type stA: structure.Structure :param stB: The candidate structure to be aligned to the reference. :type stB: structure.Structure :param anchorA: 1-based atom number of anchor atom in stA. :type anchorA: int :param anchorB: 1-based atom number of anchor atom in stB. :type anchorB: int :return: lists of B-->A atom mappings. :rtype: list(list((int, int))) """ ringsA = self._ring_atom_sets[stA.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1] atomsA, smartsA = self._getSideChain(anchorA, stA, ringsA) if atomsA == [-1]: return [] ringsB = self._ring_atom_sets[stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1] atomsB, smartsB = self._getSideChain(anchorB, stB, ringsB) if atomsB == [-1] or len(atomsB) != len(atomsA): return [] # At this point we know that the sidechains contain the same # numbers of atoms, but we don't know that they're equivalent. # And we can't assume the sidechains are the same/different simply # because the SMARTS are the same/different. The only way to know # for certain is to pick one SMARTS and match it against both # sidechains. matchesA = analyze.evaluate_smarts_canvas(stA, smartsB, uniqueFilter=False) # Filter out matches that don't consist of the atoms in atomsA. filteredA = self._filterMatches(matchesA, atomsA) if not filteredA: return [] # We only need one mapping to stB because we already requested # all mappings to stA. matchesB = analyze.evaluate_smarts_canvas(stB, smartsB, uniqueFilter=True) filteredB = self._filterMatches(matchesB, atomsB) core_mappings = [] firstA = atomsA[0] firstB = atomsB[0] # Keep only mappings for which firstA and firstB appear in the same # position in their respective lists. These are the sidechain atoms # that are bonded directly to the rings, so they must be mapped to # each other. for matchB in filteredB: indexB = matchB.index(firstB) for matchA in filteredA: if matchA.index(firstA) == indexB: core_mappings.append(list(zip(matchB, matchA))) return core_mappings def _preparePrimaryRef(self, st_ref): """ Prepares the primary reference structure, which includes finding the most similar conformer if the option to use a primary reference conformer is True. :param st_ref: Input primary reference structure. :type st_ref: structure.Structure :return: The incoming reference structure or most similar conformer, with the full set of CT-level properties. :rtype: structure.Structure """ st_number = st_ref.property[OUTPUT_STRUCTURE_NUMBER_PROP] self._logger.info(f"\nPrimary reference = structure {st_number}") st_ref.property[SHAPE_SIM_PROP] = 1.0 align_method = "" align_reason = "" if self._options.use_sampled_ref: mesg = "Replacing primary reference structure with most " + \ "similar conformer" self._logger.info(mesg) st_ref = self._alignShapes(st_ref, st_ref, append_confs=False) align_method = ALIGN_METHOD_DESCRIPTION[AlignMethod.shape_based] align_reason = "primary reference conformer requested" sim_ref = st_ref.property[SHAPE_SIM_PROP] self._logger.info("Conformer shape similarity = %.6f" % sim_ref) st_ref.property[REFERENCE_STRUCTURE_PROP] = st_number st_ref.property[ALIGN_METHOD_PROP] = align_method st_ref.property[ALIGN_REASON_PROP] = align_reason st_ref.property[CORE_SMARTS_PROP] = "" return st_ref def _refineAlignments(self, sts): """ Generates additional conformers for each structure that was snapped to its reference, with core atoms and snapped sidechains held fixed, and uses those conformers to systematically increase the average in-place shape similarity over all pairs of structures. :param sts: The aligned structures to be refined. :type sts: list(structure.Structure) :return: Refined alignments in the same order as the input structures. :rtype: list(structure.Structure) """ refiner = phase.PhpAlignmentRefiner() conf_options = phase.PhpConfOptions() if self._options.sampling_method == phase.CONF_SAMPLE_FINE_NAME: conf_options.setSamplingMethod(phase.CONF_SAMPLE_FINE) conf_options.setMaxConfs(self._options.max_confs) if self._options.minimize_confs: conf_options.setForceField(phase.CONF_FF_OPLS_2005) refiner.setConfOptions(conf_options) refiner.setMaxIterations(len(sts)) # Number completed is usually small if self._logger.getEffectiveLevel() == log.DEBUG: refiner.setVerboseOutput(True) frozen_atoms = [] refined_set = set() for imol in range(len(sts)): frozen = [] if imol in self._frozen_refinement_atoms: frozen = list(self._frozen_refinement_atoms[imol]) refined_set.add(imol) frozen_atoms.append(frozen) self._logger.info("\nRefining alignments of snapped structures") sts_refined = [ structure.Structure(ct) for ct in refiner.refine(sts, frozen_atoms) ] # Need to update SHAPE_SIM_PROP. self._updateSimilarities(sts_refined, refined_set) return sts_refined def _removeDistoredStructures(self, stB, snappedB, bond_tol=DISTORTED_BOND_TOL): """ Identifies structures in snappedB with a bond length that differs by more than bond_tol from the corresponding bond length in stB. Returns a 0-based list of the structures that contain no such bonds. :param stB: The original structure. :type stB: structure.Structure :param snappedB: List of snapped versions of stB which may contain distored bonds. :type snappedB: list(structure.Structure) :return: 0-based list of structures with no distorted bonds. :rtype: list(int) """ sts_to_keep = [] for i, st_snapped in enumerate(snappedB): distorted_bond = False for bond_ref, bond_snapped in zip(stB.bond, st_snapped.bond): if abs(bond_ref.length - bond_snapped.length) > bond_tol: distorted_bond = True break if not distorted_bond: sts_to_keep.append(i) return sts_to_keep def _setupAlign(self, sts): """ Does setup for alignment of the provided structures, which includes checking for multiple fragments, the absence of rings, and RDKit conversion failures if MCS cores are being used. Filters out such structures if the option to skip bad structures is True. Raises a RuntimeError otherwise. :param sts: The structures to be aligned. :type sts: list(structure.Structure) :return: Copies of the incoming structures, minus any bad structures, and ChmMol objects created from the usable structures :rtype: list(structure.Structure), list(ChmMol) """ adaptor = ChmMmctAdaptor() mols = [] keep = [] fragmented = [] no_rings = [] rdkit_conversion_error = [] # Don't require rings to be present if a core has been specified. require_rings = not self._core # RDKit conversion errors can occur only if using MCS cores. attempt_rdkit_conversion = self._core == MCS_CORES for i, st in enumerate(sts): if len(st.molecule) > 1: fragmented.append(i) elif attempt_rdkit_conversion and self._failsRDKitConversion(st): rdkit_conversion_error.append(i) else: mol = adaptor.create(st, ChmMmctAdaptor.StereoFromAnnotation_Safe) if require_rings and not mol.getRingCount(): no_rings.append(i) else: mols.append(mol) keep.append(i) if self._options.fail_on_bad: if fragmented: fs = ", ".join(str(i + 1) for i in fragmented) msg = ("Found multiple disconnected fragments in the " f"following structures: {fs}") raise RuntimeError(msg) if no_rings: nr = ", ".join(str(i + 1) for i in no_rings) msg = f"Found no rings in the following structures: {nr}" raise RuntimeError(msg) if rdkit_conversion_error: rce = ", ".join(str(i + 1) for i in rdkit_conversion_error) msg = ("RDKit conversion failed for the following structures: " f"{rce}") raise RuntimeError(msg) # Make copies of the valid structures and add properties that hold # the input and output structure numbers. For example, if there are # 10 input structures and structures 3 and 7 are removed, the 8 # returned structures will have input structure numbers 1, 2, 4, 5, # 6, 8, 9, 10 and output structure numbers 1, 2, 3, 4, 5, 6, 7, 8. sts_out = [] # Reference structure number may shift or it could be lost. found_ref = False for number_out, number_in in enumerate(keep): in_plus_1 = number_in + 1 out_plus_1 = number_out + 1 if self._ref_index and in_plus_1 == self._ref_index: self._ref_index = out_plus_1 found_ref = True st_out = sts[number_in].copy() st_out.property[INPUT_STRUCTURE_NUMBER_PROP] = in_plus_1 st_out.property[OUTPUT_STRUCTURE_NUMBER_PROP] = out_plus_1 sts_out.append(st_out) if self._ref_index and not found_ref: ref_index = self._ref_index - 1 if ref_index in fragmented: msg = "Reference structure contains multiple fragments" elif ref_index in no_rings: msg = "Reference structure contains no rings" else: msg = "RDKit conversion failed for reference structure" raise RuntimeError(msg) return sts_out, mols def _snapAndScore(self, stA, stB, core_mappings): """ Snaps stB to stA according to the B-->A atom mappings in core_mappings and identifies the snapped version of stB which yields the highest shape similarity to stA. If all attempts to snap the core fail, the original stB is returned. If refinement is turned on, the stB atoms from the best mapping are added to self._frozen_refinement_atoms. The stA atoms are added as long as stA isn't the primary or designated reference structure. :param stA: The fixed reference structure. :type stA: structure.Structure :param stB: The structure to be snapped onto stA. :type stB: structure.Structure :param core_mappings: Lists of 1-based B-->A atom mappings that define the various ways the core of stB is to be snapped to stA. :type core_mappings: list(list((int, int))) :return: A boolean indicating whether stB was successfully snapped, and the highest scoring snapped version of stB, or the incoming stB if snapping failed. :rtype: bool, structure.Structure """ shape = self._getShape(stA) inplace = True align_core_options = self._getAlignCoreOptions(True) try: aligner = phase.PhpAlignCore(stA.handle, stB.handle, core_mappings, align_core_options) stB_snapped = [ structure.Structure(ct) for ct in aligner.getAlignedB() ] mappings = aligner.getMappings() sim_best = 0.0 stB_snapped_best = None mapping_best = None first_conf = True for st, mapping in zip(stB_snapped, mappings): if self._options.align_terminal_hydrogens: phase.align_terminal_hydrogens(stA, st, mapping) result = shape.computeShapeSimPy(st.handle, first_conf, inplace) first_conf = False if result[0] > sim_best: sim_best = result[0] stB_snapped_best = st mapping_best = mapping if self._options.refine: stA_number = stA.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 stB_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1 addA = stA_number != self._mst_edges[0][0] for atomB, atomA in mapping_best: self._frozen_refinement_atoms[stB_number].add(atomB) if addA: self._frozen_refinement_atoms[stA_number].add(atomA) return True, stB_snapped_best except phase.PhpException as exc: iA = stA.property[OUTPUT_STRUCTURE_NUMBER_PROP] iB = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] self._logger.debug( f"Sidechain snapping failed for {iB}-->{iA}: {exc}") return False, stB def _snapSideChains(self, sts_aligned): """ Attempts to improve snapped core alignments by expanding cores to include sidechains that are common to two or more structures. In this treatment we don't use fuzzy matching, i.e., the sidechains must be chemically identical. :param sts_aligned: All structures from the initial alignment phase. :type sts_aligned: list(structure.Structure) :return: Re-snapped structures. If a structure doesn't get re-snapped, the incoming structure is returned. :rtype: list(structure.Structure) """ self._logger.info("\nAttempting to superimpose identical sidechains") # For each snapped structure, store the coordinates of each snapped # atom. We will use this data to determine the common core for each # pair of snapped structures, even if that pair was never directly # snapped onto one another. self._storeCoreAtomCoords(sts_aligned) # Compile sets of ring atoms and anchor atoms, the latter being # potential attachment points for sidechains. self._storeRingsAndAnchors(sts_aligned) # Avoid repetitive re-growing of sidechains by storing them in # dictionaries keyed by anchor atom. self._initSideChains() # Do all applicable pairwise comparisons in canonical order and re-snap # any common sidechains. sts_resnapped = [st.copy() for st in sts_aligned] resnapped_set = set() n = len(sts_aligned) for i in range(1, n): imol = self._canonical_order[i] anchor_atoms_i = self._anchor_atom_sets[imol] for j in range(i): st_i = sts_resnapped[imol] jmol = self._canonical_order[j] st_j = sts_resnapped[jmol] core_atoms_i, core_atoms_j = self._matchCoreAtoms(st_i, st_j) # Skip this pair if the core is too small or disconnected. if self._badCore(st_i, st_j, core_atoms_i, core_atoms_j): continue core_mapping = list(zip(core_atoms_i, core_atoms_j)) anchor_atoms_j = self._anchor_atom_sets[jmol] # Attempt to grow and snap sidechains attached to each anchor # atom in the respective common cores. for atom_i, atom_j in zip(core_atoms_i, core_atoms_j): if atom_i in anchor_atoms_i and atom_j in anchor_atoms_j: sidechain_mappings = self._matchSideChains( st_j, st_i, atom_j, atom_i) if not sidechain_mappings: continue new_core_mappings = [] for mapping in sidechain_mappings: # It's possible that _matchCoreAtoms already added # one or more atoms from this sidechain simply # because they were already coincident, so don't # add any atoms that are already in the core. new_core = list(core_mapping) for atom_pair in mapping: if atom_pair not in core_mapping: new_core.append(atom_pair) new_core_mappings.append(new_core) smarts = self._sidechain_smarts[imol][atom_i] mesg = "Attempting to re-snap {}-->{} with " + \ "sidechain {} attached to atoms {} and " + \ "{}\nCore mappings = {}" self._logger.debug( mesg.format(imol + 1, jmol + 1, smarts, atom_i, atom_j, new_core_mappings)) # If refinement is turned on, a successful call to # self._snapAndScore adds the st_i sidechain atoms to # self._frozen_refinement_atoms. The st_j sidechain # atoms are added if it's not the primary reference. success, snapped_i = self._snapAndScore( st_j, st_i, new_core_mappings) if success: mesg = "Successfully re-snapped {}-->{}: " + \ "Sidechain = {}, anchor atoms = {}, {}" self._logger.info( mesg.format(imol + 1, jmol + 1, smarts, atom_i, atom_j)) st_i = snapped_i sts_resnapped[imol] = snapped_i resnapped_set.add(imol) # Don't snap this sidechain again. self._sidechain_atoms[imol][atom_i] = [-1] if resnapped_set: self._updateSimilarities(sts_resnapped, resnapped_set) return sts_resnapped def _storeCoreAtomCoords(self, sts_aligned): """ For each snapped structure, this function stores the coordinates of each snapped atom. :param sts_aligned: Aligned structures. :type sts_aligned: list(structure.Structure) """ self._core_atom_coords = len(sts_aligned) * [[]] for i, st in enumerate(sts_aligned): if i in self._snapped_core_mappings: atom_coords = [] core_mapping = self._snapped_core_mappings[i] core_atoms = [atom_pair[0] for atom_pair in core_mapping] for core_atom in core_atoms: atom = st.atom[core_atom] atom_coords.append((core_atom, atom.x, atom.y, atom.z)) self._core_atom_coords[i] = atom_coords # The primary reference was never snapped to anything else, so just # store coordinates of all heavy atoms. iref = self._canonical_order[0] st_ref = sts_aligned[iref] atom_coords = [] for atom in st_ref.atom: if atom.atomic_number != 1: atom_coords.append((atom.index, atom.x, atom.y, atom.z)) self._core_atom_coords[iref] = atom_coords def _storeFuzzyScaffoldsInMol(self): """ Matches the fuzzy scaffold SMARTS against each structure and stores the scaffold index for each SMARTS that yields a match. """ queries = [canvas.ChmQuery(smarts) for smarts in self._fuzzy_smarts] matcher = canvas.ChmQueryMatcher(True) for mol in self._mols: scaffolds = set() for i, query in enumerate(queries): if matcher.hasMatch(query, mol): scaffolds.add(i) self._scaffolds_in_mol.append(scaffolds) def _storeRingsAndAnchors(self, sts): """ Compiles sets of ring atoms and anchor atoms for the provided structures. :param sts: The structures. :type sts: list(structure.Structure) """ for st in sts: ring_atoms = set(itertools.chain(*st.find_rings())) self._ring_atom_sets.append(ring_atoms) self._anchor_atom_sets.append(self._getAnchorAtoms(st, ring_atoms)) def _updateSimilarities(self, sts, subset): """ For each structure number in subset, the in-place shape similarity to the corresponding reference structure is recalculated to reflect snapping of sidechains or refinement, and the property SHAPE_SIM_PROP is updated. It's worth noting that shape similarity to the reference doesn't always increase because snapping of sidechains and refinement aren't solely focused on increasing the similarity to the reference. They are instead concerned with increasing the similarity between any number of pairs of structures. :param sts: All structures in the analysis. The SHAPE_SIM_PROP is overwritten for the affected subset. :type sts: list(structure.Structure) :param subset: 0-based set of structure numbers to operate on. :type subset: set(int) """ mesg = "\nRecalculating shape similarities to reference structure" self._logger.info(mesg) inplace = True first_conf = True ref_prev = None shape = None for i in sorted(subset): stB = sts[i] ref = stB.property[REFERENCE_STRUCTURE_PROP] - 1 if ref != ref_prev: shape = self._getShape(sts[ref]) ref_prev = ref sim_old = sts[i].property[SHAPE_SIM_PROP] sim_new = shape.computeShapeSimPy(stB, first_conf, inplace)[0] sts[i].property[SHAPE_SIM_PROP] = sim_new mesg = "Structure %d: Old similarity = %.6f, new similarity = " + \ "%.6f" self._logger.info(mesg % (i + 1, sim_old, sim_new))