Source code for schrodinger.analysis.enrichment.calculator

"""
This module contains the class for generating the default enrichment report.

Example metrics from two different screens:

The enrichment metrics from example_A are generally more favorable than
those from example_B.

Enrichment Report
-----------------

Actives file: example_A_actives.txt
Results: example_A_dock_pv.rept
Total actives: 117
Total ligands(actives+decoys): 1117
Number of ranked actives: 117

BEDROC(alpha=160.9, alpha*Ra=16.8534): 1.000
BEDROC(alpha=20.0, alpha*Ra=2.0949): 0.914
BEDROC(alpha=8.0, alpha*Ra=0.8380): 0.868
ROC: 0.92
RIE: 7.65
Area under accumulation curve: 0.87
Ave. Number of outranking decoys: 82

Count and percentage of actives in top N% of decoy results.
# Actives (1%|2%|5%|10%|20%):    90|   90|   92|   94|   97
% Actives (1%|2%|5%|10%|20%):  76.9| 76.9| 78.6| 80.3| 82.9

Enrichment Factors with respect to N% sample size.
EF (1%|2%|5%|10%|20%):     9.5|    9.5|    9.4|    7.7|    4.1
EF*(1%|2%|5%|10%|20%):      77|     38|     16|      8|    4.1
EF'(1%|2%|5%|10%|20%): 2.9e+02|1.7e+02|     54|     23|    9.9
Eff(1%|2%|5%|10%|20%):   0.974|  0.949|   0.88|  0.779|  0.611

Enrichment Factors with respect to N% actives recovered.
EF (40%|50%|60%|70%|80%|90%|100%):   9.3|  9.4|  9.4|  9.2|  5.7|    2|  1.3
EF*(40%|50%|60%|70%|80%|90%|100%): 4e+02|5e+02|6e+02|2.3e+02|   13|  2.2|  1.4
EF'(40%|50%|60%|70%|80%|90%|100%): 3.8e+02|4.3e+02|4.7e+02|4.3e+02|   38|  4.7|  2.7
FOD(40%|50%|60%|70%|80%|90%|100%): 9e-05|0.0003|0.0004|0.0006|0.003| 0.03| 0.08


Enrichment Report
-----------------

Actives file: example_B_actives.txt
Results: example_B_dock_pv.rept
Total actives: 62
Total ligands(actives+decoys): 1062
Number of ranked actives: 62

BEDROC(alpha=160.9, alpha*Ra=9.3934): 0.703
BEDROC(alpha=20.0, alpha*Ra=1.1676): 0.256
BEDROC(alpha=8.0, alpha*Ra=0.4670): 0.323
ROC: 0.72
RIE: 3.02
Area under accumulation curve: 0.71
Ave. Number of outranking decoys: 281

Count and percentage of actives in top N% of decoy results.
# Actives (1%|2%|5%|10%|20%):     8|    8|    9|   13|   23
% Actives (1%|2%|5%|10%|20%):  12.9| 12.9| 14.5| 21.0| 37.1

Enrichment Factors with respect to N% sample size.
EF (1%|2%|5%|10%|20%):      12|    6.5|    2.9|    2.1|    1.6
EF*(1%|2%|5%|10%|20%):      13|    6.5|    2.9|    2.1|    1.9
EF'(1%|2%|5%|10%|20%):      23|     12|    5.3|    3.4|    2.3
Eff(1%|2%|5%|10%|20%):   0.856|  0.732|  0.488|  0.354|  0.299

Enrichment Factors with respect to N% actives recovered.
EF (40%|50%|60%|70%|80%|90%|100%):   1.8|    2|  1.9|    2|  1.6|  1.6|    1
EF*(40%|50%|60%|70%|80%|90%|100%):   1.9|  2.1|    2|  2.1|  1.6|  1.6|  1.1
EF'(40%|50%|60%|70%|80%|90%|100%):   2.3|  2.2|  2.2|  2.2|  2.1|    2|  1.6
FOD(40%|50%|60%|70%|80%|90%|100%):   0.1|  0.1|  0.1|  0.2|  0.2|  0.2|  0.3

Copyright Schrodinger, LLC. All rights reserved.
"""

import os
import sys
from collections import defaultdict

from decorator import decorator

import schrodinger.application.canvas.fingerprint as canvas_fp
import schrodinger.application.canvas.similarity as canvas_sim
import schrodinger.structure as structure
import schrodinger.utils.fileutils as fileutils
import schrodinger.utils.log as log
from schrodinger.analysis.enrichment import enrichment_input
from schrodinger.analysis.enrichment import metrics
from schrodinger.analysis.enrichment import plotter
from schrodinger.application.glide.utils import is_valid_pv_file

CSV_FILE_EXT = ('.csv', '.CSV')
ST_TITLE_HEADER = "s_m_title"

logger = log.get_output_logger(__file__)


@decorator
def _exception_handler(func, *args, **kwargs):
    """
    A decorator to handle exceptions bubbling from functions in metrics.py.

    """
    # special_functions stores a list of Calculator class function names as
    # keys. These functions return multiple values, thus need to be
    # handled individually to ensure the conformity of return values.
    special_functions = {
        "calcBEDROC": (None, None),
    }
    try:
        return func(*args, **kwargs)
    except (ValueError, ZeroDivisionError, OverflowError, AttributeError) as e:
        func_name = func.__name__
        logger.debug(f"{func_name} caught {type(e).__name__}: {e}")
        logger.debug("Setting output as None.")
        return special_functions.get(func_name, None)


###############################################################################
# Classes
###############################################################################


[docs]class Calculator: """ A class to report default set of enrichment terms for a screen. By default, a report containing a suite of metrics is directed to standard out. :note: This is not the preferred way to obtain enrichment metrics. Please consider using parser and metric functions directly in enrichment_input.py and metrics.py if possible. :cvar ef_precision: Number of decimals when reporting EF values. Default = 2 :vartype ef_precision: int :cvar efp_precision: Number of decimals when reporting EF' values. Default = 2 :vartype efp_precision: int :cvar efs_precision: Number if decimals when reporting EF* values. Default = 2 :vartype efs_precision: int :cvar eff_precision: Number of decimals when reporting Eff values. Default = 3 :vartype eff_precision: int :cvar fod_precision: Number of decimals when reporting FOD values. Default = 1 :vartype fod_precision: int """ # Default reporting formats with %g idiom. ef_precision = 2 efs_precision = 2 efp_precision = 2 eff_precision = 3 fod_precision = 1
[docs] def __init__(self, actives, results, total_decoys=0): """ :param actives: File name or a list of strings containing all active titles. If a file name is provided, the input should be a valid csv or structure file, a raw text file containing one line per title is also acceptable. Duplicate titles are discarded, only the first occurrence is recorded. :type actives: str or list(str) :param results: File name, a list of strings, or a list of structure.Structure containing the virtual screening result ordered by the scoring metric. If a file name is provided, the input should be a valid csv file or structure file. Duplicate titles are discarded, only the first occurrence is recorded. :type results: str or list(str) or list(structure.Structure) :param total_decoys: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type total_decoys: int """ # Used for default enrichment report self.percentage_active = (0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) self.percentage_ligand = (0.01, 0.02, 0.05, 0.10, 0.20) # Values used in fingerprint components, uses in report() self.min_Tc_total_actives = None self.missing_active_titles = None # Parse the actives file/list self.active_titles = self._parseActives(actives) # Parse the results file/list self._parseResults(results, total_decoys) # Shortcut for self.report() if isinstance(actives, str): self.actives = os.path.basename(actives) else: self.actives = "List of active titles" if isinstance(results, str): self.results = os.path.basename(results) else: self.results = "List of ranked ligand titles" self.total_decoys = self.total_ligands - self.total_actives # Calculate all of the default metrics self._calculateMetrics()
def __repr__(self): rpr = "EnrichmentReporter(%s, %s, %d)" % ( self.actives, self.results, int(self.total_decoys) # Cast back to input integer. ) return rpr def _parseActives(self, actives_file): """ parse actives_file, return distinct active titles from the actives file. Repeated active titles are ignored. :param actives_file: File name or a list of strings containing all active titles. If a file name is provided, the input should be a valid csv or structure file, a raw text file containing one line per title is also acceptable. Duplicate titles are discarded, only the first occurrence is recorded. :type actives_file: str or list(str) :return: distinct active titles from the actives file. :rtype: set(str) """ if isinstance(actives_file, str): _, ext = fileutils.splitext(actives_file) # csv file if ext in CSV_FILE_EXT: active_titles = \ enrichment_input.extract_active_titles_from_csv( actives_file) # structure file elif fileutils.get_structure_file_format(actives_file): active_titles = \ enrichment_input.extract_active_titles_from_mae( actives_file) # assume input is a plain text file otherwise else: active_titles = \ enrichment_input.extract_active_titles_from_txt( actives_file) # Actives input is a sequence of structure titles. elif hasattr(actives_file, '__iter__'): if not actives_file: raise TypeError("Actives list must not be empty.") if isinstance(actives_file[0], str): active_titles = \ enrichment_input.extract_active_titles_from_list( actives_file) else: raise TypeError("Actives list must be a list of strings.") else: raise TypeError("Actives input type must be a string or a list.") return active_titles def _parseResults(self, results_file, num_decoy): """ Identify the file type of results_file and call the correct parser function to set rank and count related class variables. :param results_file: File name, a list of strings, or a list of structure.Structure containing the virtual screening result ordered by the scoring metric. If a file name is provided, the input should be a valid csv file or structure file. Duplicate titles are discarded, only the first occurrence is recorded. :type results_file: str or list(str) or list(structure.Structure) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int """ if isinstance(results_file, str): _, ext = fileutils.splitext(results_file) # csv file if ext in CSV_FILE_EXT: self.total_actives, self.total_ligands, self.active_ranks, \ self.adjusted_active_ranks, self.total_ranked, \ self.title_ranks = \ enrichment_input.extract_ranks_from_csv( csv_file_name=results_file, active_titles=self.active_titles, num_decoy=num_decoy) # structure file elif fileutils.get_structure_file_format(results_file): self.total_actives, self.total_ligands, self.active_ranks, \ self.adjusted_active_ranks, self.total_ranked, \ self.title_ranks = \ enrichment_input.extract_ranks_from_mae( mae_file_name=results_file, active_titles=self.active_titles, num_decoy=num_decoy) self._getFingerprintComponent(results_file) else: msg = "Results must be a csv file or a maestro file." raise TypeError(msg) # Actives input is a sequence of structure titles. elif hasattr(results_file, '__iter__'): if not results_file: raise TypeError("Results list must not be empty.") # list of strings if isinstance(results_file[0], str): self.total_actives, self.total_ligands, self.active_ranks, \ self.adjusted_active_ranks, self.total_ranked, \ self.title_ranks = \ enrichment_input.extract_ranks_from_list( titles_iter=results_file, active_titles=self.active_titles, num_decoy=num_decoy) # list of structure.Structure elif isinstance(results_file[0], structure.Structure): self.total_actives, self.total_ligands, self.active_ranks, \ self.adjusted_active_ranks, self.total_ranked, \ self.title_ranks = \ enrichment_input.extract_ranks_from_structures( structure_iter=results_file, active_titles=self.active_titles, num_decoy=num_decoy) self._getFingerprintComponent(results_file) else: msg = "Results list must be a list of strings " \ "or structure.Structure." raise TypeError(msg) else: raise TypeError("Results input type must be a string or a list.") def _getFingerprintComponent(self, results_file): """ Initialize and return a data class object needed for fingerprint-related calculations. This is basically enrichment_input.getFingerprintComponent but we'll also need to find the missing active titles for the reporter. :param results_file: File name, a list of strings, or a list of structure.Structure containing the virtual screening result ordered by the scoring metric. If a file name is provided, the input should be a valid csv file or structure file. Duplicate titles are discarded, only the first occurrence is recorded. :type results_file: str or list(str) or list(structure.Structure) """ fp_gen = canvas_fp.CanvasFingerprintGenerator(logger=logger) fp_gen.setType('Linear') fp_gen.setAtomBondTyping(fp_gen.getDefaultAtomTypingScheme()) fp_sim = canvas_sim.CanvasFingerprintSimilarity(logger=logger) fp_sim.setMetric('Tanimoto') # Bind initial values title_ranks = {} hits_titles = set() active_fingerprint = {} if isinstance(results_file, str): # Assume Glide results are typical, skip the receptor in pv files file_index = 2 if is_valid_pv_file(results_file) else 1 for st in structure.StructureReader(results_file, file_index): title = st.property[ST_TITLE_HEADER] enrichment_input._fingerprint_gen(st, title, self.active_titles, fp_gen, hits_titles, title_ranks, active_fingerprint) elif hasattr(results_file, '__iter__'): for st in results_file: title = st.property[ST_TITLE_HEADER] enrichment_input._fingerprint_gen(st, title, self.active_titles, fp_gen, hits_titles, title_ranks, active_fingerprint) min_tc = enrichment_input._calc_min_tc_total_actives( fp_sim, active_fingerprint) self.min_Tc_total_actives = min_tc self.fingerprint_object = enrichment_input.FingerprintComponent( fp_gen, fp_sim, active_fingerprint, min_tc) self.missing_active_titles = \ [title for title in self.active_titles if title not in hits_titles] ################### # Metrics functions ################### def _getActiveSampleSizeStar(self, fraction_of_actives): return metrics.get_active_sample_size_star( total_actives=self.total_actives, total_ligands=self.total_ligands, adjusted_active_ranks=self.adjusted_active_ranks, fraction_of_actives=fraction_of_actives) def _getActiveSampleSize(self, fraction_of_actives): return metrics.get_active_sample_size( total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, fraction_of_actives=fraction_of_actives) def _getDecoySampleSize(self, fraction_of_decoys): return metrics.get_decoy_sample_size( total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, fraction_of_decoys=fraction_of_decoys)
[docs] @_exception_handler def calcEF(self, n_sampled_set, min_actives=None): return metrics.calc_EF(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, n_sampled_set=n_sampled_set, min_actives=min_actives)
[docs] @_exception_handler def calcEFStar(self, n_sampled_decoy_set, min_actives=None): return metrics.calc_EFStar(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, n_sampled_decoy_set=n_sampled_decoy_set, min_actives=min_actives)
[docs] @_exception_handler def calcEFP(self, n_sampled_decoy_set, min_actives=None): return metrics.calc_EFP(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, n_sampled_decoy_set=n_sampled_decoy_set, min_actives=min_actives)
[docs] @_exception_handler def calcFOD(self, fraction_of_actives): return metrics.calc_FOD(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, fraction_of_actives=fraction_of_actives)
[docs] @_exception_handler def calcEFF(self, fraction_of_decoys): return metrics.calc_EFF( total_actives=self.total_actives, total_ligands=self.total_ligands, adjusted_active_ranks=self.adjusted_active_ranks, fraction_of_decoys=fraction_of_decoys)
[docs] def calcActivesInN(self, n_sampled_set): return metrics.calc_ActivesInN(active_ranks=self.active_ranks, n_sampled_set=n_sampled_set)
[docs] def calcActivesInNStar(self, n_sampled_set): return metrics.calc_ActivesInNStar( adjusted_active_ranks=self.adjusted_active_ranks, n_sampled_set=n_sampled_set)
[docs] def calcAveNumberOutrankingDecoys(self): return metrics.calc_AveNumberOutrankingDecoys( active_ranks=self.active_ranks)
[docs] @_exception_handler def calcBEDROC(self, alpha=20.0): return metrics.calc_BEDROC(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, alpha=alpha)
[docs] @_exception_handler def calcRIE(self, alpha=20.0): return metrics.calc_RIE(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, alpha=alpha)
[docs] @_exception_handler def calcAUAC(self): return metrics.calc_AUAC(total_actives=self.total_actives, total_ligands=self.total_ligands, total_ranked=self.total_ranked, active_ranks=self.active_ranks)
[docs] @_exception_handler def calcROC(self): return metrics.calc_ROC( total_actives=self.total_actives, total_ligands=self.total_ligands, adjusted_active_ranks=self.adjusted_active_ranks)
[docs] @_exception_handler def calcDEF(self, n_sampled_set, min_actives=None): return metrics.calc_DEF(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, title_ranks=self.title_ranks, fingerprint_comp=self.fingerprint_object, n_sampled_set=n_sampled_set, min_actives=min_actives)
[docs] @_exception_handler def calcDEFStar(self, n_sampled_decoy_set, min_actives=None): return metrics.calc_DEFStar(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, title_ranks=self.title_ranks, fingerprint_comp=self.fingerprint_object, n_sampled_decoy_set=n_sampled_decoy_set, min_actives=min_actives)
[docs] @_exception_handler def calcDEFP(self, n_sampled_decoy_set, min_actives=None): return metrics.calc_DEFP(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, title_ranks=self.title_ranks, fingerprint_comp=self.fingerprint_object, n_sampled_decoy_set=n_sampled_decoy_set, min_actives=min_actives)
#################### # Plotter functions ####################
[docs] def calculateSpecificity(self, rank): return plotter.calc_specificity_with_rank( total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, rank=rank)
[docs] def getPercentScreenCurvePoints(self): return plotter.get_percent_screen_curve_points( total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks)
[docs] def getActiveRankCsvRows(self): return plotter.get_plot_data(total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, title_ranks=self.title_ranks)
[docs] def savePlot(self, png_file="plot.png", title='Screen Results', xlabel='1-Specificity', ylabel='Sensitivity'): return plotter.save_plot( total_actives=self.total_actives, total_ligands=self.total_ligands, active_ranks=self.active_ranks, adjusted_active_ranks=self.adjusted_active_ranks, total_ranked=self.total_ranked, png_file=png_file, title=title, xlabel=xlabel, ylabel=ylabel)
#################### # Reporter functions ####################
[docs] @staticmethod def format(value, precision=2): """ :param value: Float value to format as string. :type value: float or None :param precision: Number of digits after the decimal. :type precision: int :return: a string representation of the passed value. If the value is None then the returned string is 'n/a'. Uses %g formatting idiom so large values are returned as exponentials. :rtype: str """ str_value = 'n/a' if value is None else "%.*g" % (precision, value) return str_value
def _calculateMetrics(self): """ Sets a suite of enrichment factor terms as instance data members. """ active_sample_size_list = [ self._getActiveSampleSize(e) for e in self.percentage_active ] active_sample_size_list_star = [ self._getActiveSampleSizeStar(e) for e in self.percentage_active ] m_active_size_list = [ round(self.total_actives * e) for e in self.percentage_active ] ligand_sample_size_list = [ self._getDecoySampleSize(e) for e in self.percentage_ligand ] m_ligand_size_list = [ round(self.total_ligands * e) for e in self.percentage_ligand ] self.ave_num_outranking_decoys = self.calcAveNumberOutrankingDecoys() self.bedroc20, self.alphaRa20 = self.calcBEDROC(alpha=20.0) self.bedroc160_9, self.alphaRa160_9 = self.calcBEDROC(alpha=160.9) self.bedroc8_0, self.alphaRa8_0 = self.calcBEDROC(alpha=8.0) self.roc = self.calcROC() self.rie = self.calcRIE() self.auac = self.calcAUAC() for pct, n_sampled_set, n_sampled_decoy_set, min_actives in zip( self.percentage_active, active_sample_size_list, active_sample_size_list_star, m_active_size_list): ef_name = f'ef_{int(pct * 100)}' ef_value = self.calcEF(n_sampled_set, min_actives) setattr(self, ef_name, ef_value) efs_name = f'efs_{int(pct * 100)}' efs_value = self.calcEFStar(n_sampled_decoy_set, min_actives) setattr(self, efs_name, efs_value) efp_name = f'efp_{int(pct * 100)}' efp_value = self.calcEFP(n_sampled_decoy_set, min_actives) setattr(self, efp_name, efp_value) fod_stat_name = f"fod_{int(pct * 100)}" fod_stat_value = self.calcFOD(pct) setattr(self, fod_stat_name, fod_stat_value) for pct, n_sampled_set, n_sampled_decoy_set in zip( self.percentage_ligand, m_ligand_size_list, ligand_sample_size_list): stat_name = f"actives_in_top_{int(pct * 100)}_pct" stat_value = self.calcActivesInN(self.total_ligands * pct) setattr(self, stat_name, stat_value) pct_stat_name = f"pct_actives_in_top_{int(pct * 100)}_pct" pct_stat_value = (100.0 * stat_value / self.total_actives) setattr(self, pct_stat_name, pct_stat_value) stat_name_star = f"actives_in_top_{int(pct * 100)}_pct_star" stat_value_star = self.calcActivesInNStar(self.total_decoys * pct) setattr(self, stat_name_star, stat_value_star) pct_stat_name_star = f"pct_actives_in_top_{int(pct * 100)}_pct_star" pct_stat_value_star = (100.0 * stat_value_star / self.total_actives) setattr(self, pct_stat_name_star, pct_stat_value_star) ef_pct_name = f"ef_{int(pct * 100)}pct" ef_pct_value = self.calcEF(n_sampled_set) setattr(self, ef_pct_name, ef_pct_value) efs_pct_name = f"efs_{int(pct * 100)}pct" efs_pct_value = self.calcEFStar(n_sampled_decoy_set) setattr(self, efs_pct_name, efs_pct_value) efp_pct_name = f"efp_{int(pct * 100)}pct" efp_pct_value = self.calcEFP(n_sampled_decoy_set) setattr(self, efp_pct_name, efp_pct_value) eff_stat_name = f"eff_{int(pct * 100)}pct" eff_stat_value = self.calcEFF(pct) setattr(self, eff_stat_name, eff_stat_value) def_name = f"def_{int(pct * 100)}pct" def_value = self.calcDEF(n_sampled_set) setattr(self, def_name, def_value) defs_name = f"defs_{int(pct * 100)}pct" defs_value = self.calcDEFStar(n_sampled_decoy_set) setattr(self, defs_name, defs_value) defp_name = f"defp_{int(pct * 100)}pct" defp_value = self.calcDEFP(n_sampled_decoy_set) setattr(self, defp_name, defp_value) # Assign synomym attributes so clients of the deprecated # glideanalysis.Enrichment class are less affected by the change. old_ef_name = f"ef_{int(pct * 100)}pct" new_ef_name = f"efd_{int(pct * 100)}" new_ef_value = getattr(self, old_ef_name) setattr(self, new_ef_name, new_ef_value) old_ait_name = f"actives_in_top_{int(pct * 100)}_pct" new_ait_name = f"actives_in_top_{int(pct * 100)}_prcnt" new_ait_value = getattr(self, old_ait_name) setattr(self, new_ait_name, new_ait_value)
[docs] def report(self, file_handle=sys.stdout, header="", footer=""): """ Prints text summary of results to the file_handle. :param header: Header for the report. :type header: str :param footer: Footer for the report. :type footer: str :param file_handle: File handle-like object, default is sys.stdout. :type file_handle: file """ out_fh = file_handle results = self.results if not isinstance(results, str): results = 'Structure list (%d items)' % len(self.results) line = '=' * 80 out_fh.write(line) out_fh.write('\n') line = """ Enrichment Report ----------------- %s Actives file: %s Results: %s Total actives: %d Total ligands(actives+decoys): %d Number of ranked actives: %d BEDROC(alpha=160.9, alpha*Ra=%.4f): %.3f BEDROC(alpha=20.0, alpha*Ra=%.4f): %.3f BEDROC(alpha=8.0, alpha*Ra=%.4f): %.3f ROC: %.2f RIE: %.2f Area under accumulation curve: %.2f Ave. Number of outranking decoys: %d Minimum Tc over all active pairs: %s """ % (header, self.actives, results, self.total_actives, self.total_ligands, len( self.active_ranks), self.alphaRa160_9, self.bedroc160_9, self.alphaRa20, self.bedroc20, self.alphaRa8_0, self.bedroc8_0, self.roc, self.rie, self.auac, self.ave_num_outranking_decoys, self.format(self.min_Tc_total_actives, 2)) out_fh.write(line) line = """ Count and percentage of actives in top N%% of decoy results. %% Decoys |%s|%s|%s|%s|%s| # Actives |%s|%s|%s|%s|%s| %% Actives |%s|%s|%s|%s|%s| """ % ("%5s" % "1%", "%5s" % "2%", "%5s" % "5%", "%5s" % "10%", "%5s" % "20%", "%5d" % self.actives_in_top_1_pct_star, "%5d" % self.actives_in_top_2_pct_star, "%5d" % self.actives_in_top_5_pct_star, "%5d" % self.actives_in_top_10_pct_star, "%5d" % self.actives_in_top_20_pct_star, "%5.1f" % self.pct_actives_in_top_1_pct_star, "%5.1f" % self.pct_actives_in_top_2_pct_star, "%5.1f" % self.pct_actives_in_top_5_pct_star, "%5.1f" % self.pct_actives_in_top_10_pct_star, "%5.1f" % self.pct_actives_in_top_20_pct_star) out_fh.write(line) line = """ Count and percentage of actives in top N%% of results. %% Results |%s|%s|%s|%s|%s| # Actives |%s|%s|%s|%s|%s| %% Actives |%s|%s|%s|%s|%s| """ % ("%5s" % "1%", "%5s" % "2%", "%5s" % "5%", "%5s" % "10%", "%5s" % "20%", "%5d" % self.actives_in_top_1_pct, "%5d" % self.actives_in_top_2_pct, "%5d" % self.actives_in_top_5_pct, "%5d" % self.actives_in_top_10_pct, "%5d" % self.actives_in_top_20_pct, "%5.1f" % self.pct_actives_in_top_1_pct, "%5.1f" % self.pct_actives_in_top_2_pct, "%5.1f" % self.pct_actives_in_top_5_pct, "%5.1f" % self.pct_actives_in_top_10_pct, "%5.1f" % self.pct_actives_in_top_20_pct) out_fh.write(line) line = """ Enrichment Factors with respect to N%% sample size. %% Sample |%s|%s|%s|%s|%s| EF |%s|%s|%s|%s|%s| EF* |%s|%s|%s|%s|%s| EF' |%s|%s|%s|%s|%s| DEF |%s|%s|%s|%s|%s| DEF* |%s|%s|%s|%s|%s| DEF' |%s|%s|%s|%s|%s| Eff |%s|%s|%s|%s|%s| """ % ( "%7s" % "1%", "%7s" % "2%", "%7s" % "5%", "%7s" % "10%", "%7s" % "20%", "%7s" % self.format(self.ef_1pct, self.ef_precision), "%7s" % self.format(self.ef_2pct, self.ef_precision), "%7s" % self.format(self.ef_5pct, self.ef_precision), "%7s" % self.format(self.ef_10pct, self.ef_precision), "%7s" % self.format(self.ef_20pct, self.ef_precision), "%7s" % self.format(self.efs_1pct, self.efs_precision), "%7s" % self.format(self.efs_2pct, self.efs_precision), "%7s" % self.format(self.efs_5pct, self.efs_precision), "%7s" % self.format(self.efs_10pct, self.efs_precision), "%7s" % self.format(self.efs_20pct, self.efs_precision), "%7s" % self.format(self.efp_1pct, self.efp_precision), "%7s" % self.format(self.efp_2pct, self.efp_precision), "%7s" % self.format(self.efp_5pct, self.efp_precision), "%7s" % self.format(self.efp_10pct, self.efp_precision), "%7s" % self.format(self.efp_20pct, self.efp_precision), "%7s" % self.format(self.def_1pct, self.ef_precision), "%7s" % self.format(self.def_2pct, self.ef_precision), "%7s" % self.format(self.def_5pct, self.ef_precision), "%7s" % self.format(self.def_10pct, self.ef_precision), "%7s" % self.format(self.def_20pct, self.ef_precision), "%7s" % self.format(self.defs_1pct, self.efs_precision), "%7s" % self.format(self.defs_2pct, self.efs_precision), "%7s" % self.format(self.defs_5pct, self.efs_precision), "%7s" % self.format(self.defs_10pct, self.efs_precision), "%7s" % self.format(self.defs_20pct, self.efs_precision), "%7s" % self.format(self.defp_1pct, self.efp_precision), "%7s" % self.format(self.defp_2pct, self.efp_precision), "%7s" % self.format(self.defp_5pct, self.efp_precision), "%7s" % self.format(self.defp_10pct, self.efp_precision), "%7s" % self.format(self.defp_20pct, self.efp_precision), "%7s" % self.format(self.eff_1pct, self.eff_precision), "%7s" % self.format(self.eff_2pct, self.eff_precision), "%7s" % self.format(self.eff_5pct, self.eff_precision), "%7s" % self.format(self.eff_10pct, self.eff_precision), "%7s" % self.format(self.eff_20pct, self.eff_precision), ) out_fh.write(line) line = """ Enrichment Factors with respect to N%% actives recovered. %% Actives |%s|%s|%s|%s|%s|%s|%s| EF |%s|%s|%s|%s|%s|%s|%s| EF* |%s|%s|%s|%s|%s|%s|%s| EF' |%s|%s|%s|%s|%s|%s|%s| FOD |%s|%s|%s|%s|%s|%s|%s| """ % ( "%7s" % "40%", "%7s" % "50%", "%7s" % "60%", "%7s" % "70%", "%7s" % "80%", "%7s" % "90%", "%7s" % "100%", "%7s" % self.format(self.ef_40, self.ef_precision), "%7s" % self.format(self.ef_50, self.ef_precision), "%7s" % self.format(self.ef_60, self.ef_precision), "%7s" % self.format(self.ef_70, self.ef_precision), "%7s" % self.format(self.ef_80, self.ef_precision), "%7s" % self.format(self.ef_90, self.ef_precision), "%7s" % self.format(self.ef_100, self.ef_precision), "%7s" % self.format(self.efs_40, self.efs_precision), "%7s" % self.format(self.efs_50, self.efs_precision), "%7s" % self.format(self.efs_60, self.efs_precision), "%7s" % self.format(self.efs_70, self.efs_precision), "%7s" % self.format(self.efs_80, self.efs_precision), "%7s" % self.format(self.efs_90, self.efs_precision), "%7s" % self.format(self.efs_100, self.efs_precision), "%7s" % self.format(self.efp_40, self.efp_precision), "%7s" % self.format(self.efp_50, self.efp_precision), "%7s" % self.format(self.efp_60, self.efp_precision), "%7s" % self.format(self.efp_70, self.efp_precision), "%7s" % self.format(self.efp_80, self.efp_precision), "%7s" % self.format(self.efp_90, self.efp_precision), "%7s" % self.format(self.efp_100, self.efp_precision), "%7s" % self.format(self.fod_40, self.fod_precision), "%7s" % self.format(self.fod_50, self.fod_precision), "%7s" % self.format(self.fod_60, self.fod_precision), "%7s" % self.format(self.fod_70, self.fod_precision), "%7s" % self.format(self.fod_80, self.fod_precision), "%7s" % self.format(self.fod_90, self.fod_precision), "%7s" % self.format(self.fod_100, self.fod_precision), ) out_fh.write(line) out_fh.write('\n') out_fh.write('%s %s\n' % ('Rank'.ljust(6), 'Title'.ljust(73))) out_fh.write('%s %s\n' % ('-' * 6, '-' * 73)) for rank in sorted(self.title_ranks): title = self.title_ranks[rank] out_fh.write('%s %s\n' % (str(rank).ljust(6), title.ljust(73))) out_fh.write('\n') if self.missing_active_titles: out_fh.write("Missing ranks for:\n") out_fh.write("\n".join(self.missing_active_titles)) out_fh.write('\n') out_fh.write(footer) out_fh.write('\n') line = '=' * 80 out_fh.write(line) out_fh.write('\n')
[docs] def getCsvRows(self): """ Return a list of two lists, the first inner list contains all metric names, the other contains all corresponding metric values. :return: a list of header and enrichment value tuples. :rtype: list """ percentage_active = [int(pct * 100) for pct in self.percentage_active] percentage_ligand = [int(pct * 100) for pct in self.percentage_ligand] dict_keys = ("actives_in_top_pct", "pct_actives_in_top_pct", "actives_in_top_pct_star", "pct_actives_in_top_pct_star", "ef_pct", "efs_pct", "efp_pct", "def_pct", "defs_pct", "defp_pct", "eff_pct", "ef_actives", "efs_actives", "efp_actives", "fod_actives") value_storage = defaultdict(list) for pct in percentage_ligand: value_storage[dict_keys[0]].append( (f"# Actives {pct}%", getattr(self, f"actives_in_top_{pct}_pct"))) value_storage[dict_keys[1]].append( (f"% Actives {pct}%", getattr(self, f"pct_actives_in_top_{pct}_pct"))) value_storage[dict_keys[2]].append( (f"# Actives {pct}%*", getattr(self, f"actives_in_top_{pct}_pct_star"))) value_storage[dict_keys[3]].append( (f"% Actives {pct}%*", getattr(self, f"pct_actives_in_top_{pct}_pct_star"))) value_storage[dict_keys[4]].append( (f"EF {pct}%", self.format(getattr(self, f"ef_{pct}pct"), self.ef_precision))) value_storage[dict_keys[5]].append( (f"EF* {pct}%", self.format(getattr(self, f"efs_{pct}pct"), self.efs_precision))) value_storage[dict_keys[6]].append( (f"EF' {pct}%", self.format(getattr(self, f"efp_{pct}pct"), self.efp_precision))) value_storage[dict_keys[7]].append( (f"DEF {pct}%", self.format(getattr(self, f"def_{pct}pct"), self.ef_precision))) value_storage[dict_keys[8]].append( (f"DEF* {pct}%", self.format(getattr(self, f"defs_{pct}pct"), self.efs_precision))) value_storage[dict_keys[9]].append( (f"DEF' {pct}%", self.format(getattr(self, f"defp_{pct}pct"), self.efp_precision))) value_storage[dict_keys[10]].append( (f"Eff {pct}%", self.format(getattr(self, f"eff_{pct}pct"), self.eff_precision))) for pct in percentage_active: value_storage[dict_keys[11]].append( (f"EF {pct}% actives", self.format(getattr(self, f"ef_{pct}"), self.ef_precision))) value_storage[dict_keys[12]].append( (f"EF* {pct}% actives", self.format(getattr(self, f"efs_{pct}"), self.efs_precision))) value_storage[dict_keys[13]].append( (f"EF' {pct}% actives", self.format(getattr(self, f"efp_{pct}"), self.efp_precision))) value_storage[dict_keys[14]].append( (f"FOD {pct}% actives", self.format(getattr(self, f"fod_{pct}"), self.fod_precision))) # yapf: disable enrich_data = [('Actives file', self.actives), ('Results file', self.results), ('Total ligands(actives+decoys)', self.total_ligands), ('Total actives', self.total_actives), ('Number of ranked actives', len(self.active_ranks)), ('BEDROC(alpha=160.9)', self.bedroc160_9), ('BEDROC(alpha=160.9) alpha*Ra', self.alphaRa160_9), ('BEDROC(alpha=20.0)', self.bedroc20), ('BEDROC(alpha=20.0) alpha*Ra', self.alphaRa20), ('BEDROC(alpha=8.0)', self.bedroc8_0), ('BEDROC(alpha=8.0) alpha*Ra', self.alphaRa8_0), ('ROC', self.roc), ('RIE', self.rie), ('Area under accumulation curve', self.auac), ('Ave. Num. of outranking decoys', self.ave_num_outranking_decoys)] # yapf: enable for k in dict_keys: enrich_data.extend(value_storage[k]) rows = [] rows.append([item[0] for item in enrich_data]) rows.append([item[1] for item in enrich_data]) return rows