Source code for schrodinger.application.combinatorial_screen.combinatorial_screener

"""
This module contains the CombinatorialScreener class, which employs a
heuristic approach to identify subsets of combinatorial reactants that
are most likely to yield enumerated products with the highest dendritic
fingerprint similarities to a query.

Basic Algorithm
---------------

For the example reaction A + B + C --> ABC, the reactants in each of the 3
groups are ranked by decreasing Tversky similarity to the query, where the
Tversky weight for the reactant R is 1 and the Tversky weight for the query
Q is 0. In other words,

rank_score(R, Q) = ON(R & Q) / ON(R)

Where:

ON(R & Q) = Number of 'on' bits shared by reactant and query
ON(R) = Number of 'on' bits in reactant

This quantity is maximized when R is a substructure of Q.

Once the reactants have been ranked, limits NA, NB, NC on the ranked lists
are assigned to yield the subset S(NA, NB, NC) = A[0:NA] x B[0:NB] x C[0:NC],
where [0:N] is a Python slice.

If the minimum number of enumerated products desired is min_products, the
limits must be chosen such that NA * NB * NC >= min_products.

NA, NB, NC are arrived by setting them to 1 and then performing a systematic
exploration of larger values with the goal of identifying combinations of
reactants whose logical OR fingerprints yield the highest similarities to
the query. In the case of dendritic fingerprints, the logical OR similarities
correlate strongly with similarities computed from the enumerated products,
so this is a good approximation that avoids enumeration of S(NA, NB, NC) as
the limits are varied.

A rough outline of the procedure is as follows::

    1. Set NA = NB = NC = 1
    2. if NA * NB * NC >= min_products, we are done
    3. sim_best = 0, R_best = None
    4. for R in (A, B, C):
           NR += 1  # Temporarily add new reactant R_new
           for each (a, b, c) in S(NA, NB, NC), where R_new is in (a, b, c)
               FP_abc = FP(a) | FP(b) | FP(c)  # Logical OR fingerprint
               sim = Tanimoto(FP_abc, FP_query)
               if sim > sim_best:
                   sim_best = sim
                   R_best = R
            NR -= 1  # Remove new reactant
    5. NR_best += 1  # Expand limit to include best new reactant
    6. Go to step 2

This approach is superior to assigning equal limits, such as (10, 10, 10)
if 1000 products are desired. In many cases, the algorithm finds limits that
are quite ragged, such as (2, 50, 10), and the enumerated compound with the
highest similarity to the query is found at some non-obvious position, such
as (1, 45, 7).

Copyright Schrodinger LLC, All Rights Reserved.
"""

import itertools

import numpy

from schrodinger.application.combinatorial_screen import \
    fingerprint_utils as fp_utils
from schrodinger.infra import canvas
from schrodinger.infra import phase

# MAX_COMBOS is used to assign a default upper bound on the size of each of
# the N reactant subsets:
#
# max_reactants = MAX_COMBOS ** (1/N)
#
# For example, if there are 3 reactants, the upper bound would be 100, which
# means the reactant subset limits would top out at (100, 100, 100). This is
# done to force the exploration of all dimensions of the reactant subspace in
# cases where there's a natural tendency to keep one or more dimensions very
# small. So instead of getting limits like (1, 320, 313) when 100,000 products
# are requested, we would get limits like (10, 100, 100). The advantages of
# of this approach are that it prevents step 4 in the above algorithm from
# becoming painfully slow, and it tends to increase the number of successfully
# enumerated products for the same total number of reactant combinations.
MAX_COMBOS = 1000000


[docs]class CombinatorialScreener: """ Identifies subsets of combinatorial reactants that are most likely to yield products with the highest dendritic Tanimoto similarities to a query. """
[docs] def __init__(self, reactant_fp_files, query_smiles, max_reactants=None, reactant_classes=None): """ Constructor taking the names of dendritic fingerprint files for one or more classes of reactants, the SMILES string for a query, an optional cap on the reactant subset size and, if supplying a multi- reactant fingerprint file created by RfpDatabase, the names of the reactant classes within that file to utilize. If supplying a separate Canvas fingerprint file for each reactant class, those files must contain a single extra data column that holds the SMILES of the reactants. If the same fingerprint file is supplied more than once, each instance is treated as a separate reactant class, but the file is read only once, and all screens will yield identical reactant subsets for all instances of that reactant class. If supplying a single fingerprint file created by RfpDatabase, the format is guaranteed to be compatible with this class, and the reactant class names play a role that's analogous to the different Canvas fingerprint file names in the previous use case. Example usage with multiple Canvas fingerprint files:: reactant_fp_files = ['alcohols.fp', 'halides-aryl.fp', 'thiols.fp'] query_smiles = 'c1ccccc1Nc(nc(c23)[nH]cn3)nc2-c4cc(ccc4)-c5ccccc5' screener = CombinatorialScreener(reactant_fp_files, query_smiles) Example usage with a single RfpDatabase fingerprint file:: reactant_fp_file = 'all_reactants.rfpdb' query_smiles = 'c1ccccc1Nc(nc(c23)[nH]cn3)nc2-c4cc(ccc4)-c5ccccc5' reactant_classes = ['alcohols', 'halides-aryl', 'thiols'] screener = CombinatorialScreener([reactant_fingerprint_file], query_smiles, reactant_classes=reactant_classes) :param reactant_fp_files: List of reactant fingerprint files. :type reactant_fp_files: list[str] :param query_smiles: SMILES string for query. :type query_smiles: str :param max_reactants: Maximum allowed size of each reactant subset when a query is screened. The default is MAX_COMBOS ** 1/N, where N is the number of reactant groups. :type max_reactants: int :param reactant_classes: Names of reactant classes if using a single RfpDatabase fingerprint file. :type reactant_classes: list[str] """ nr = len(reactant_fp_files) if nr == 0: raise RuntimeError("No reactant fingerprint files supplied") if nr == 1 and reactant_classes: file_format = phase.get_phase_file_format(reactant_fp_files[0]) if file_format != phase.PhpFileFormat_PHP_FORMAT_RFPDB: msg = ('Illegal reactant fingerprint database file name: ' f'"{reactant_fp_files[0]}". Extension must be ' f'"{phase.PHASE_RFPDB_FILE_EXT}".') raise RuntimeError(msg) nr = len(reactant_classes) elif nr > 1 and reactant_classes: msg = ('Multiple fingerprint files may not be supplied when ' 'reactant classes are specified') raise RuntimeError(msg) self.reactant_count = nr self._query_fp = fp_utils.smiles_to_fingerprint(query_smiles) if max_reactants is not None: if max_reactants < 1: raise ValueError("Maximum number of reactants must be >= 1") else: max_reactants = MAX_COMBOS**(1 / self.reactant_count) self.max_reactants = int(round(max_reactants)) self.reactant_classes = reactant_classes self.reactant_fps = nr * [None] self.reactant_titles = nr * [None] self.reactant_smiles = nr * [None] self.reactant_indices = nr * [None] # 0-based reactant numbers self.max_reactants_reached = False if reactant_classes: self._rank_reactants_in_rfpdb_file(reactant_fp_files[0], reactant_classes) else: self._rank_reactants_in_canvas_files(reactant_fp_files)
[docs] def screen(self, min_products): """ Performs a similarity screen against the query to determine the number of reactants in each sorted group that are required to make the minimum number of enumerated products. These reactant counts are stored in self.reactant_limits. :param min_products: Minimum number of theoretical products :type min_products: int """ if min_products > self.max_products: raise ValueError("Too many products requested") self.max_reactants_reached = False self.reactant_limits = self.reactant_count * [1] while True: if numpy.prod(self.reactant_limits) >= min_products: break if not self._add_best_reactant(): self.max_reactants_reached = True break
def _add_best_reactant(self): """ Increments the reactant limit which yields the combination of reactants with the highest logical OR similarity to the query. Returns True if a reactant was added and False if a limit was reached that prevented a reactant from being added. :return: True if a reactant was added; False if not :rtype: bool """ sim_best = 0.0 r_best = None nr = self.reactant_count for r in range(nr): if self.reactant_limits[r] == len(self.reactant_fps[r]): # No more reactants left in this group. continue if self.reactant_limits[r] >= self.max_reactants: # Reached the self-imposed upper bound on this group. continue pools = [] for i in range(nr): ilimit = self.reactant_limits[i] if i == r: pools.append([ilimit]) # New reactant else: pools.append(list(range(0, ilimit))) # Other reactants # Consider all resulting combinations of ranked reactants. for combo in itertools.product(*pools): sim = fp_utils.get_reactant_combo_sim(self._query_fp, self.reactant_fps, combo) if sim > sim_best: sim_best = sim r_best = r if r_best is not None: self.reactant_limits[r_best] += 1 # Increment any duplicates of this reactant to preserve symmetry. for r_dup in self._reactant_equiv[r_best]: self.reactant_limits[r_dup] += 1 return r_best is not None def _rank_reactants_in_canvas_files(self, reactant_fp_files): """ Ranks reactants by Tversky similarity to query when reactants are supplied in separate Canvas fingerprint files. :param reactant_fp_files: List of reactant fingerprint files. :type reactant_fp_files: list[str] """ nr = self.reactant_count self.max_products = 1 # Keep track of duplicate reactant files while reading and scoring. prev_reactants = {} self._reactant_equiv = [[] for i in range(nr)] for i in range(nr): if reactant_fp_files[i] in prev_reactants: iprev = prev_reactants[reactant_fp_files[i]] self._reactant_equiv[i].append(iprev) self._reactant_equiv[iprev].append(i) self.reactant_titles[i] = self.reactant_titles[iprev] self.reactant_smiles[i] = self.reactant_smiles[iprev] self.reactant_indices[i] = self.reactant_indices[iprev] self.reactant_fps[i] = self.reactant_fps[iprev] else: self.reactant_titles[i], self.reactant_smiles[i], \ self.reactant_indices[i], self.reactant_fps[i] = \ fp_utils.rank_reactants(reactant_fp_files[i], self._query_fp, self.max_reactants) # yapf: disable prev_reactants[reactant_fp_files[i]] = i self.max_products *= len(self.reactant_fps[i]) def _rank_reactants_in_rfpdb_file(self, reactant_fp_file, reactant_classes): """ Ranks reactants by Tversky similarity to query when reactants are supplied in a single RfpDatabase fingerprint file. :param reactant_fp_file: Reactant fingerprint file :type reactant_fp_file: str :param reactant_classes: Reactant classes to be ranked. :type reactant_classes: list[str] """ nr = self.reactant_count self.max_products = 1 unique_reactant_classes = set(reactant_classes) screener = phase.RfpScreener(reactant_fp_file, self._query_fp.toVector(), list(unique_reactant_classes), self.max_reactants) # Keep track of duplicate reactant classes while storing hit data. prev_reactants = {} self._reactant_equiv = [[] for i in range(nr)] for i, reactant_class in enumerate(reactant_classes): if reactant_class in prev_reactants: iprev = prev_reactants[reactant_class] self._reactant_equiv[i].append(iprev) self._reactant_equiv[iprev].append(i) else: prev_reactants[reactant_class] = i hits = screener.getHits(reactant_class) self.reactant_titles[i] = [hit.name for hit in hits] self.reactant_smiles[i] = [hit.smiles for hit in hits] self.reactant_indices[i] = [hit.id - 1 for hit in hits] self.reactant_fps[i] = [ canvas.ChmSparseBitset(hit.onbits) for hit in hits ] self.max_products *= len(hits)