Source code for schrodinger.analysis.enrichment.metrics

"""
Stand-alone functions for calculating metrics. The metrics include terms
such as Receiver Operator Characteristic area under the curve (ROC),
Enrichment Factors (EF), and Robust Initial Enhancement (RIE).

Copyright Schrodinger, LLC. All rights reserved.
"""

import itertools
import math
import textwrap

import schrodinger.utils.log as log

logger = log.get_output_logger(__file__)


def _check_positive_int(value, variable_name):
    """
    Raise a ValueError if value is not a positive integer.
    """
    if not isinstance(value, int) or (value <= 0):
        raise ValueError(f"{variable_name} must be a positive integer.")


def _check_none_or_positive_int(value, variable_name):
    """
    Raise a ValueError if value is not None or a positive integer.
    """
    if not isinstance(value, int) or (value <= 0):
        if value is not None:
            msg = f"{variable_name} must be None or a positive integer."
            raise ValueError(msg)


def _check_non_negative_int(value, variable_name):
    """
    Raise a ValueError if value is not a non-negative integer.
    """
    if not isinstance(value, int) or (value < 0):
        raise ValueError(f"{variable_name} must be a non-negative integer.")


def _check_non_empty_list(lst, lst_name):
    """
    Raise a ValueError if lst is an empty list.
    """
    if (not isinstance(lst, list)) or (len(lst) == 0):
        raise ValueError(f"{lst_name} must be a non-empty list.")


def _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks,
                          n_actives_sampled_set):
    """
    df = (1- min_Tc_actives)/(1 - min_Tc_total_actives)
    O(N^2) complexity, N = total number of actives.

    :return: a float representation of the diversity factor.
    :rtype: float

    """
    active_ranks = active_ranks[:n_actives_sampled_set]

    active_fingerprint = fingerprint_comp.active_fingerprint
    fp_sim = fingerprint_comp.fp_sim
    min_Tc_total_actives = fingerprint_comp.min_Tc_total_actives

    # The highest possible value for minTc is 1
    min_Tc_actives = 1.0

    if not active_fingerprint:
        msg = "_get_diversity_factor: missing fingerprints. minTc set to 1."
        logger.debug(msg)
        return min_Tc_actives

    for i_rank, j_rank in itertools.combinations(active_ranks, 2):
        sim = fp_sim.calculateSimilarity(
            active_fingerprint[title_ranks[i_rank]],
            active_fingerprint[title_ranks[j_rank]])
        if sim < min_Tc_actives:
            min_Tc_actives = sim

    diverse_factor = (1 - min_Tc_actives) / (1 - min_Tc_total_actives)

    return diverse_factor


[docs]def get_active_sample_size_star(total_actives, total_ligands, adjusted_active_ranks, fraction_of_actives): """ The size of the decoy sample set required to recover the specified fraction of actives. If there are fewer ranked actives than the requested fraction of all actives then the number of total_ligands is returned. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param adjusted_active_ranks: Modified active ranks; each rank is improved by the number of preceding actives. For example, a screen result that placed three actives as the first three ranks, [1, 2, 3], has adjusted ranks of [1, 1, 1]. In this way, actives are not penalized by being outranked by other actives. :type adjusted_active_ranks: list(int) :param fraction_of_actives: Decimal notation for the fraction of sampled actives, used to determine the sample set size. :type fraction_of_actives: float :return: The size of the decoy sample set required to recover the specified fraction of actives. :rtype: int """ n_actives = int(round(total_actives * fraction_of_actives)) n_sampled_set = None try: # The sampled set size is the rank of the active at the index # needed to recover n_actives -1, because it is zero-based, # plus the number of previously ranked decoys. n_sampled_set = \ adjusted_active_ranks[n_actives - 1] + \ len(adjusted_active_ranks[:n_actives - 2]) except IndexError: n_sampled_set = total_ligands return n_sampled_set
[docs]def get_active_sample_size(total_actives, total_ligands, active_ranks, fraction_of_actives): """ The size of the sample set required to recover the specified fraction of actives. If there are fewer ranked actives than the requested fraction of all actives then the number of total_ligands is returned. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param fraction_of_actives: Decimal notation for the fraction of sampled actives, used to determine the sample set size. :type fraction_of_actives: float :return: the size of the sample set required to recover the specified fraction of actives. :rtype: int """ n_actives = math.ceil(total_actives * fraction_of_actives) n_sampled_set = None if n_actives > len(active_ranks): n_sampled_set = total_ligands else: for index, rank in enumerate(active_ranks): if index + 1 <= n_actives: n_sampled_set = rank return n_sampled_set
[docs]def get_decoy_sample_size(total_actives, total_ligands, active_ranks, fraction_of_decoys): """ Returns the size of the sample set required to recover the specified fraction of decoys. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param fraction_of_decoys: Decimal notation for the fraction of sampled decoys, used to determine the sample set size. :type fraction_of_decoys: float :return: Size of the sample set required to recover the specified fraction of decoys. :rtype: int """ total_decoys = total_ligands - total_actives n_decoys = int(round(fraction_of_decoys * total_decoys)) sampled_decoys = [] size_max = int(round(total_ligands + 1)) for rank in range(1, size_max): if rank not in active_ranks: sampled_decoys.append(rank) if len(sampled_decoys) == n_decoys: break n_sampled_set = 0 if sampled_decoys: n_sampled_set = sampled_decoys[-1] - 1 return n_sampled_set
[docs]def calc_ActivesInN(active_ranks, n_sampled_set): """ Return the number of the known active ligands found in a given sample size. :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param n_sampled_set: The number of rank results for which to calculate the metric. Every active with a rank less than or equal to this value will be counted as found in the set. :type n_sampled_set: int :return: the number of the known active ligands found in a given sample size. :rtype: int """ return len([r for r in active_ranks if r <= n_sampled_set])
[docs]def calc_ActivesInNStar(adjusted_active_ranks, n_sampled_set): """ Return the number of the known active ligands found in a given sample size. :param adjusted_active_ranks: Modified active ranks; each rank is improved by the number of preceding actives. For example, a screen result that placed three actives as the first three ranks, [1, 2, 3], has adjusted ranks of [1, 1, 1]. In this way, actives are not penalized by being outranked by other actives. :type adjusted_active_ranks: list(int) :param n_sampled_set: The number of rank results for which to calculate the metric. Every active with a rank less than or equal to this value will be counted as found in the set. :type n_sampled_set: int :return: the number of the known active ligands found in a given sample size. :rtype: int """ return len([r for r in adjusted_active_ranks if r <= n_sampled_set])
[docs]def calc_AveNumberOutrankingDecoys(active_ranks): """ The rank of each active is adjusted by the number of outranking actives. The number of outranking decoys is then defined as the adjusted rank of that active minus one. The number of outranking decoys is calculated for each docked active and averaged. :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :return: the average number of decoys that outranked the actives. :rtype: float """ # Note: # Only known actives that dock contribute to this number. # Experiments that dock one active with a high rank appear # dramatically better than experiments that dock all known actives # with so-so ranks. # Bind default value ave_num_outranking_decoys = None num_outranking_decoys = [] for rank in active_ranks: # The number of decoy ranked higher is the actives rank minus 1, # and less the number of preceeding actives. num_outranking_decoys.append((rank - 1) - active_ranks.index(rank)) if len(active_ranks) > 0: ave_num_outranking_decoys = \ sum(num_outranking_decoys) / len(active_ranks) return ave_num_outranking_decoys
[docs]def calc_DEF(total_actives, total_ligands, active_ranks, title_ranks, fingerprint_comp, n_sampled_set, min_actives=None): """ Diverse Enrichment Factor, calculated with respect to the number of total ligands. DEF is defined as:: 1 - (min_similarity_among_actives_in_sampled_set) DEF = EF * -------------------------------------------------- 1 - (min_similarity_among_all_actives) where 'n_sampled_set' is the number of *all* ranks in which to search for actives. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param title_ranks: *Unadjusted* integer rank keys for title. Not available for table inputs, or other screen results that don't list the title. :type title_ranks: dict(str, int) :param fingerprint_comp: Fingerprint component data class object :type fingerprint_comp: enrichment_input.FingerprintComponent :param n_sampled_set: The number of ranked decoy results for which to calculate the enrichment factor. :type n_sampled_set: int :param min_actives: The number of actives that must be within the n_sampled_set, otherwise the returned EF value is None. :type min_actives: int :return: Diverse Enrichment Factor (DEF) for the given sample size of the screen results. If fewer than min_actives are found in the set, or the calculation raises a ZeroDivisionError, the returned value is None. :rtype: float """ fp_gen = fingerprint_comp.fp_gen fp_sim = fingerprint_comp.fp_sim if not (fp_gen and fp_sim): logger.debug("calcDEF: fingerprint tools unavailable.") return None # Bind initial value. d_ef = calc_EF(total_actives, total_ligands, active_ranks, n_sampled_set, min_actives) diverse_factor = None # Number of actives in sampled set n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set) # Determine the diversity modifier. Return early if the value # can't be calculated. if n_actives_sampled_set <= 1: diverse_factor = 0 # No ij sim calculation possible. logger.debug( "calcDEF %d actives in sample, no similarity to calculate." % n_actives_sampled_set) return None elif n_actives_sampled_set == 1 and total_actives == 1: diverse_factor = 1 else: diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks, n_actives_sampled_set) # Finally, do d_ef calc. d_ef *= diverse_factor return d_ef
[docs]def calc_DEFStar(total_actives, total_ligands, active_ranks, title_ranks, fingerprint_comp, n_sampled_decoy_set, min_actives=None): """ Here, Diverse EF* (DEF*) is defined as:: 1 - (min_similarity_among_actives_in_sampled_set) DEF = EF_star * -------------------------------------------------- 1 - (min_similarity_among_all_actives) where 'n_sampled_decoy_set' is the number of *decoy* ranks in which to search for actives. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param title_ranks: *Unadjusted* integer rank keys for title. Not available for table inputs, or other screen results that don't list the title. :type title_ranks: dict(str, int) :param fingerprint_comp: Fingerprint component data class object :type fingerprint_comp: enrichment_input.FingerprintComponent :param n_sampled_decoy_set: The number of ranked decoys for which to calculate the enrichment factor. :type n_sampled_decoy_set: int :param min_actives: The number of actives that must be within the n_sampled_decoy_set, otherwise the returned EF value is None. :type min_actives: int :return: Diverse Enrichment Factor (DEF*) for the given sample size of the screen results, calculated with respect to the total decoys instead of the more traditional total ligands. If fewer than min_actives are found in the set the returned value is None. :rtype: float """ fp_gen = fingerprint_comp.fp_gen fp_sim = fingerprint_comp.fp_sim if not (fp_gen and fp_sim): logger.debug("calcDEFStar: fingerprint tools unavailable.") return None # Bind initial value. defs = calc_EFStar(total_actives, total_ligands, active_ranks, n_sampled_decoy_set, min_actives) n_actives_sampled_set = 0 for rank in range(1, int(n_sampled_decoy_set + 2)): if rank in active_ranks: n_actives_sampled_set += 1 # Determine the diversity modifier. Return early if the value # can't be calculated. if n_actives_sampled_set <= 1: diverse_factor = 0 # No ij sim calculation possible. logger.debug( "calcDEFStar %d actives in sample, no similarity to calculate." % n_actives_sampled_set) return None elif n_actives_sampled_set == 1 and total_actives == 1: diverse_factor = 1 else: diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks, n_actives_sampled_set) # Finally, do d_ef calc. defs *= diverse_factor return defs
[docs]def calc_DEFP(total_actives, total_ligands, active_ranks, title_ranks, fingerprint_comp, n_sampled_decoy_set, min_actives=None): """ Diverse EF' (DEF') is defined as:: 1 - (min_similarity_among_actives_in_sampled_set) DEF' = EF' * -------------------------------------------------- 1 - (min_similarity_among_all_actives) :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param title_ranks: *Unadjusted* integer rank keys for title. Not available for table inputs, or other screen results that don't list the title. :type title_ranks: dict(str, int) :param fingerprint_comp: Fingerprint component data class object :type fingerprint_comp: enrichment_input.FingerprintComponent :param n_sampled_decoy_set: The number of ranked decoy results for which to calculate the enrichment factor. :type n_sampled_decoy_set: int :param min_actives: The number of actives that must be within the n_sampled_decoy_set, otherwise the returned EF' value is None. :type min_actives: int :return: Diverse Enrichment Factor prime (DEF') for a given sample size. If fewer than min_actives are found in the set the returned value is None. :rtype: float """ fp_gen = fingerprint_comp.fp_gen fp_sim = fingerprint_comp.fp_sim if not (fp_gen and fp_sim): logger.debug("calcDEFStar: fingerprint tools unavailable.") return None # Bind initial value. defp = calc_EFP(total_actives, total_ligands, active_ranks, n_sampled_decoy_set, min_actives) n_actives_sampled_set = 0 for rank in range(1, int(total_ligands) + 1): if rank in active_ranks: n_actives_sampled_set += 1 if rank >= n_sampled_decoy_set + 1: break # Determine the diversity modifier. Return early if the value # can't be calculated. if n_actives_sampled_set <= 1: diverse_factor = 0 # No ij sim calculation possible. logger.debug( "calcDEFStar %d actives in sample, no similarity to calculate." % n_actives_sampled_set) return None elif n_actives_sampled_set == 1 and total_actives == 1: diverse_factor = 1 else: diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks, n_actives_sampled_set) defp *= diverse_factor return defp
[docs]def calc_EF(total_actives, total_ligands, active_ranks, n_sampled_set, min_actives=None): """ Calculates the Enrichment factor (EF) for the given sample size of the screen results. If fewer than min_actives are found in the set, or the calculation raises a ZeroDivisionError, the returned value is None. EF is defined as:: n_actives_in_sampled_set / n_sampled_set EF = ---------------------------------------- total_actives / total_ligands where 'n_sampled_set' is the number of *all* ranks in which to search for actives. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param n_sampled_set: The number of ranked results for which to calculate the enrichment factor. :type n_sampled_set: int :param min_actives: The number of actives that must be within the n_sampled_set, otherwise the returned EF value is None. :type min_actives: int :return: enrichment factor :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_positive_int(n_sampled_set, "n_sampled_set") _check_non_empty_list(active_ranks, "active_ranks") _check_none_or_positive_int(min_actives, "min_actives") # Bind default value ef = None # Number of actives in sampled set. n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set) # Sanity check this threshold. Return early if the value can't # be calculated. if min_actives and (min_actives > n_actives_sampled_set): logger.warning( "calc_EF: %d minimum actives, but only %d in the sample." % (min_actives, n_actives_sampled_set)) return ef # Finally, do ef calc. ef = (n_actives_sampled_set / n_sampled_set) / \ (total_actives / total_ligands) return ef
[docs]def calc_EFStar(total_actives, total_ligands, active_ranks, n_sampled_decoy_set, min_actives=None): """ Calculate the Enrichment factor* (EF*) for the given sample size of the screen results, calculated with respect to the total decoys instead of the more traditional total ligands. If fewer than min_actives are found in the set the returned value is None. Here, EF* is defined as:: n_actives_in_sampled_set / n_sampled_decoy_set EF* = ---------------------------------------------- total_actives / total_decoys where 'n_sampled_decoy_set' is the number of *decoy* ranks in which to search for actives. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param n_sampled_decoy_set: The number of ranked decoys for which to calculate the enrichment factor. :type n_sampled_decoy_set: int :param min_actives: The number of actives that must be within the n_sampled_decoy_set, otherwise the returned EF value is None. :type min_actives: int :return: enrichment factor* :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(active_ranks, "active_ranks") _check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set") _check_none_or_positive_int(min_actives, "min_actives") # Bind default value. ef = None total_decoys = total_ligands - total_actives if total_decoys <= 0: raise ValueError("total_ligands must be larger than total_actives.") n_decoys = 0.0 n_actives = 0.0 n_actives_sampled_set = 0.0 for rank in range(1, int(n_sampled_decoy_set + 2)): if rank in active_ranks: n_actives += 1.0 else: n_decoys += 1.0 frac_active_found = n_actives / total_actives frac_decoy_found = n_decoys / total_decoys n_actives_sampled_set = n_actives if min_actives and (n_actives_sampled_set < min_actives): logger.warning( "calc_EFStar: %d minimum actives, but only %d in the sample." % (min_actives, n_actives_sampled_set)) ef = None else: try: ef = frac_active_found / frac_decoy_found except ZeroDivisionError: logger.debug("calc_EFStar: caught ZeroDivisionError.") if frac_active_found: logger.debug("Actives found in the sampled set, return inf.") ef = float('inf') else: logger.debug("No actives in the sampled set, return None.") ef = None return ef
[docs]def calc_EFP(total_actives, total_ligands, active_ranks, n_sampled_decoy_set, min_actives=None): """ Calculates modified enrichment factor defined using the average of the reciprocals of the EF* enrichment factors for recovering the first aa% of the known actives, Enrichment Factor prime (EF'). EF'(x) will be larger than EF*(x) if the actives in question come relatively early in the sequence, and smaller if they come relatively late. If fewer than min_actives are found in the set the returned value is None. EF' is defined as:: n_actives_sampled_set EF' = -------------------------------------------- cumulative_sum(frac. decoys/frac. actives) :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param n_sampled_decoy_set: The number of ranked decoys for which to calculate the enrichment factor. :type n_sampled_decoy_set: int :param min_actives: The number of actives that must be within the n_sampled_decoy_set, otherwise the returned EF value is None. :type min_actives: int :return: enrichment factor prime :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(active_ranks, "active_ranks") _check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set") _check_none_or_positive_int(min_actives, "min_actives") # Bind default value efp = None total_decoys = total_ligands - total_actives if total_decoys <= 0: raise ValueError("total_ligands must be larger than total_actives.") nkount = 0.0 frac_actives_found = 0.0 frac_decoys_found = 0.0 ensum = 0.0 n_actives = 0.0 n_decoys = 0.0 active_ranks_set = set(active_ranks) for rank in range(1, int(total_ligands) + 1): if rank in active_ranks_set: n_actives += 1 frac_actives_found = n_actives / total_actives else: n_decoys += 1 frac_decoys_found = n_decoys / total_decoys if frac_actives_found > 0 and frac_decoys_found > 0: nkount += 1 ensum += frac_decoys_found / frac_actives_found if rank >= n_sampled_decoy_set + 1: break if n_actives == 0: logger.warning("calc_EFP: No actives found in the sample.") efp = None elif min_actives and (n_actives < min_actives): logger.warning( "calc_EFP: %d minimum actives, but only %d in the sample." % (min_actives, n_sampled_decoy_set)) efp = None else: try: efp = nkount / ensum except ZeroDivisionError: logger.debug("calc_EFP: caught ZeroDivisionError.") logger.debug("Actives found in the sampled set, return inf.") efp = float('inf') return efp
[docs]def calc_FOD(total_actives, total_ligands, active_ranks, fraction_of_actives): r""" Calculates the average fraction of decoys outranking the given fraction, provided as a float, of known active ligands. The returned value is None if a) the calculation raises a ZeroDivisionError, or b) fraction_of_actives generates more actives than are ranked, or c) the fraction_of_actives is greater than 1.0 FOD is defined as:: __ 1 \ number_outranking_decoys_in_sampled_set FOD = ------------- / --------------------------------------- num_actives -- total_decoys :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param fraction_of_actives: Decimal notation of the fraction of sampled actives, used to set the sampled set size. :type fraction_of_actives: float :return: Average fraction of outranking decoys. :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(active_ranks, "active_ranks") # Bind default value fod = None total_decoys = total_ligands - total_actives n_actives = int(round(total_actives * fraction_of_actives)) if total_decoys <= 0: raise ValueError("total_ligands must be larger than total_actives.") if (fraction_of_actives > 1.0) or (fraction_of_actives < 0.0): msg = textwrap.dedent(""" calc_FOD: fraction_of_actives must be within [0, 1], setting FOD as None. """) logger.warning(msg) return fod if (n_actives == 0): logger.warning("calc_FOD: n_actives is 0, setting FOD as None.") return fod if (n_actives > len(active_ranks)): msg = textwrap.dedent(""" calc_FOD: fraction_of_actives is generating more actives than ranked, setting FOD as None. """) logger.warning(msg) return fod # Create a list of the frac. of outranking decoys for each active rank. fractions = [] for rank in active_ranks[0:n_actives]: # Number of decoys ranked higher is the rank of the active, # not counting itself (-1), and not count any preceeding actives # (index.(rank)) num_outranking_decoys = float((rank - 1) - active_ranks.index(rank)) fractions.append(num_outranking_decoys / total_decoys) # Finally, do FOD calc. fod = sum(fractions) / n_actives return fod
[docs]def calc_EFF(total_actives, total_ligands, adjusted_active_ranks, fraction_of_decoys): """ Calculate efficiency in distinguishing actives from decoys (EFF) on an absolute scale of 1 (perfect; all actives come before any decoys) to -1 (all decoys come before any actives); a value of 0 means that actives and decoys were recovered at equal proportionate rates. The returned value is None if the calculation raises a ZeroDivisionError. EFF is defined as:: frac. actives in sample EFF = (2* -----------------------------------------------) - 1 frac actives in sample + frac. decoys in sample :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param adjusted_active_ranks: Modified active ranks; each rank is improved by the number of preceding actives. For example, a screen result that placed three actives as the first three ranks, [1, 2, 3], has adjusted ranks of [1, 1, 1]. In this way, actives are not penalized by being outranked by other actives. :type adjusted_active_ranks: list(int) :param fraction_of_decoys: The size of the set is in terms of the number of decoys in the screen. For example, given 1000 decoys and fraction_of_decoys = 0.20, actives that appear within the first 200 ranks are counted. :type fraction_of_decoys: float :return: Active recovery efficiency at a particular sample set size :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks") # Bind default value eff = None total_decoys = total_ligands - total_actives if total_decoys <= 0: raise ValueError("total_ligands must be larger than total_actives.") # Determine fraction of database to assay. n_sampled_set = total_decoys * fraction_of_decoys # Number of actives in sampled set. n_actives_sampled_set = calc_ActivesInNStar(adjusted_active_ranks, n_sampled_set) frac_actives = n_actives_sampled_set / total_actives frac_decoys = n_sampled_set / total_decoys # Finally, do eff calc. try: eff = 2.0 * (frac_actives / (frac_actives + frac_decoys)) - 1 except ZeroDivisionError: logger.warning("calc_EFF: caught ZeroDivisionError.") return eff
[docs]def calc_BEDROC(total_actives, total_ligands, active_ranks, alpha=20.0): """ Boltzmann-enhanced Discrimination Receiver Operator Characteristic area under the curve. The value is bounded between 1 and 0, with 1 being ideal screen performance. The default alpha=20 weights the first ~8% of screen results. When alpha*Ra << 1, where Ra is the radio of total actives to total ligands, and alpha is the exponential prefactor, the BEDROC metric takes on a probabilistic meaning. Calculated as described by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 36. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param alpha: Exponential prefactor for adjusting early enrichment emphasis. Larger values more heavily weight the early ranks. alpha = 20 weights the first ~8% of the screen, alpha = 10 weights the first ~10% of the screen, alpha = 50 weights the first ~3% of the screen results. :type alpha: float :return: a tuple of two floats, the first represents the area under the curve for the Boltzmann-enhanced discrimination of ROC (BEDROC) analysis, the second is the alpha*Ra term. :rtype: (float, float) """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(active_ranks, "active_ranks") if alpha <= 0: raise ValueError("alpha must be greater than zero.") bedroc = None Ra = total_actives / total_ligands alphaRa = alpha * Ra rie = calc_RIE(total_actives, total_ligands, active_ranks, alpha) term_1 = None term_2 = None try: term_1 = Ra * math.sinh(alpha / 2.0) /\ (math.cosh(alpha / 2.0) - math.cosh(alpha / 2.0 - Ra * alpha)) term_2 = 1.0 / (1.0 - math.exp(alpha * (1.0 - Ra))) bedroc = rie * term_1 + term_2 except ZeroDivisionError: raise ZeroDivisionError("unexpected ZeroDivisionError.") except OverflowError: raise OverflowError("unexpected OverflowError.") return (bedroc, alphaRa)
[docs]def calc_RIE(total_actives, total_ligands, active_ranks, alpha=20.0): """ Robust Initial Enhancement (RIE). Active ranks are weighted with an continuously decreasing exponential term. Large positive RIE values indicate better screen performance. Calculated as described by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 18. :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param alpha: Exponential prefactor for adjusting early enrichment emphasis. Larger values more heavily weight the early ranks. alpha = 20 weights the first ~8% of the screen, alpha = 10 weights the first ~10% of the screen, alpha = 50 weights the first ~3% of the screen results. :type alpha: float :return: a float for the Robust Initial Enhancement (RIE). :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(active_ranks, "active_ranks") if alpha <= 0: raise ValueError("alpha must be greater than zero.") Ra = total_actives / total_ligands wght_sum = sum( [math.exp(-1 * alpha * r_i / total_ligands) for r_i in active_ranks]) try: rie = wght_sum / (Ra * (1.0 - math.exp(-alpha)) / (math.exp(alpha / total_ligands) - 1.0)) except ZeroDivisionError: msg = textwrap.dedent(""" calc_RIE: There were zero total actives for RIE calculation, setting RIE to zero. """) logger.warning(msg) rie = 0.0 return rie
[docs]def calc_AUAC(total_actives, total_ligands, total_ranked, active_ranks): """ Area Under the Accumulation Curve (AUAC). The value is bounded between 1 and 0, with 1 being ideal screen performance. Calculated as described by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 8. (execept adjusted to a trapezoidal integration, to decrease errors for small data sets). :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param total_ranked: The total number of ligands ranked by the virtual screen scoring metric. :type total_ranked: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :return: A float representation of the Area Under the Accumulation Curve. :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_positive_int(total_ranked, "total_ranked") _check_non_empty_list(active_ranks, "active_ranks") # Integrate stepwise over the ranked actives with curve terminating # at (Ntotal,Nranked_actives) area = sum([total_ligands - rank for rank in active_ranks]) # Add trapezoidal correction area += len(active_ranks) / 2.0 # Add correction for unranked actives, evenly distributed # after last ranked ligand. Curve now drawn to (Ntotal,Nactives) area += (total_actives - len(active_ranks)) * (total_ligands - total_ranked) / 2.0 # Normalize by the Ntotal*Nactives area to get a ratio: auac = area / (total_actives * total_ligands) return auac
[docs]def calc_ROC(total_actives, total_ligands, adjusted_active_ranks): """ Calculates a representation of the Receiver Operator Characteristic area underneath the curve. Typically interpreted as the probability an active will appear before an inactive. A value of 1.0 reflects ideal performance, a value of 0.5 reflects a performance on par with random selection. Calculated as described by: Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq A.8 Clasically ROC area is defined as:: AUAC Ra ROC = ------ - ----- Ri 2Ri Where AUAC is the area under the accumulation curve, Ri is the ratio of inactives, Ra is the ratio of actives. A different method is used here in order to account for unranked actives - see PYTHON-3055 & PYTHON-3106 :param total_actives: The total number of active ligands in the screen, ranked and unranked. :type total_actives: int :param total_ligands: The total number of ligands (actives and unknowns/ decoys) used in the screen. :type total_ligands: int :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :return: receiver operator characteristic area underneath the curve :rtype: float """ _check_positive_int(total_actives, "total_actives") _check_positive_int(total_ligands, "total_ligands") _check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks") total_decoys = total_ligands - total_actives if total_decoys <= 0: raise ValueError("total_ligands must be larger than total_actives.") # Integrate stepwise over the ranked actives with curve terminating at # (Ndecoys,Nranked_actives) area = sum( [total_decoys - adj_rank + 1 for adj_rank in adjusted_active_ranks]) # Normalize by the Ndecoys*Nactives area roc = area / (total_decoys * total_actives) return roc