Source code for schrodinger.active_learning.al_report

import csv
import json
import math
import os
import zipfile

import numpy as np
import reportlab.lib.utils as reportlabel_utils
import scipy
import scipy.stats
import sklearn.metrics as metrics
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.platypus import Image
from reportlab.platypus import Paragraph
from reportlab.platypus import Spacer
from reportlab.platypus import Table

from schrodinger.active_learning import al_utils
from schrodinger.active_learning.al_utils import PILOT_TASK
from schrodinger.active_learning.al_utils import SCREEN_TASK
from schrodinger.utils import log

import matplotlib  # isort:skip

matplotlib.use('Agg')  # isort:skip
import matplotlib.pyplot as plt  # isort:skip

try:
    import schrodinger.application.desmond.report_helper as rh
except ImportError:
    rh = None

# Module-level logger used for warnings emitted while building reports.
logger = log.get_output_logger(__file__)

# Keyword arguments forwarded to the grid call of every plot in this module.
_GRID_LINE_PROPERTIES = dict(color='#bdbdbd', linestyle='--', linewidth=0.5)
# The inner single quotes are part of the value on purpose: the constant is
# interpolated directly into ``<font color=...>`` markup, where it serves as
# the quoted attribute value.
RUN_INFO_TEXT_COLOR = "'#888888'"


def get_ligand_ml_metric(ligand_ml_model_file):
    """
    Extract the test set metrics, test set labels and predictions from
    ligand_ml model file.

    The archive is read as ``<model_name>/smasher/report.json`` plus two
    ``.npy`` members under ``<model_name>/smasher/metrics/``, where
    ``<model_name>`` is the basename of the model file without its extension.

    :param ligand_ml_model_file: ligand_ml .qzip model file.
    :type ligand_ml_model_file: str

    :return: r2, mae, rmse, labels and prediction of the test set
    :rtype: float, float, float, 1d numpy array, 2d numpy array

    :raise KeyError: if an expected member is missing from the archive or an
        expected key is missing from the report.
    """
    basename = os.path.basename(ligand_ml_model_file)
    model_name = os.path.splitext(basename)[0]
    # Member names inside a zip archive always use "/" as separator, so they
    # must not be assembled with os.path.join (which yields "\\" on Windows).
    smasher_dir = f"{model_name}/smasher"

    with zipfile.ZipFile(ligand_ml_model_file, 'r') as zipobj:
        with zipobj.open(f"{smasher_dir}/report.json", "r") as json_file:
            report = json.load(json_file)

        # Only the first metrics entry of the report is used.
        metric = report["metrics"][0]
        r2, mae, rmse = metric["r2"], metric["mae"], metric["rmse"]

        # The arrays are stored next to the plot, keyed by the plot basename.
        testdata_basename = os.path.basename(metric["plot_path"])
        data_prefix = f"{smasher_dir}/metrics/{testdata_basename}"

        # Close every member handle as soon as the array has been loaded.
        with zipobj.open(f"{data_prefix}-holdout_y.npy") as y_true_file:
            y_true = np.load(y_true_file)
        with zipobj.open(f"{data_prefix}-final_results.npy") as y_pred_file:
            y_pred = np.load(y_pred_file)

    return r2, mae, rmse, y_true, y_pred
def make_train_report(ligand_ml_model_file, report_path, iter_num):
    """
    Write a pdf summarising the test set accuracy of a ligand_ml model.

    The report holds a heading, a line with R2 / RMSE / MAE and the
    regression plot, which is saved as
    ``testset_regression_iter_<iter_num>.png`` in the working directory.

    :param ligand_ml_model_file: ligand_ml .qzip model file.
    :type ligand_ml_model_file: str

    :param report_path: path of the pdf report
    :type report_path: str

    :param iter_num: current iteration number
    :type iter_num: int
    """
    r2, mae, rmse, labels, predictions = get_ligand_ml_metric(
        ligand_ml_model_file)

    # The plot has to exist on disk before it can be embedded in the pdf.
    plot_file = f"testset_regression_iter_{iter_num}.png"
    plot_regression(labels, predictions, plot_file)

    styles = getSampleStyleSheet()
    document = rh.platypus.SimpleDocTemplate(report_path)

    heading = Paragraph(
        f"Test Set Accuracy of Machine Learning Model (iteration {iter_num})",
        styles['Heading2'])
    heading.hAlign = "CENTER"

    summary = Paragraph(
        f"R2: {r2:.2f} &nbsp RMSE: {rmse:.2f} &nbsp MAE:{mae:.2f}",
        styles['BodyText'])
    gap = Spacer(width=0, height=0.5 * inch)
    plot_image = get_image(plot_file, 6 * inch)

    document.build([heading, gap, summary, plot_image])
def get_image(path, width=1 * inch):
    """
    Build a reportlab image of the requested width for an image file, scaling
    the height so that the aspect ratio of the file is kept.

    :param path: path of the image file.
    :type path: str

    :param width: width of the reportlab image.
    :type width: float

    :return: reportlab image
    :rtype: reportlab.platypus.Image
    """
    # The reader is only needed to query the pixel dimensions of the file.
    pixel_width, pixel_height = reportlabel_utils.ImageReader(path).getSize()
    height_per_width = pixel_height / float(pixel_width)
    scaled_height = width * height_per_width
    return Image(path, width=width, height=scaled_height)
def get_report_maker(active_learning_job):
    """
    Pick the report maker that matches the task of the active learning job.

    Only the pilot task has a report maker; every other task yields None.

    :param active_learning_job: active learning job to be processed.
    :type active_learning_job: ActiveLearningJob

    :return: corresponding report maker, or None when the task has no report
    :rtype: ALPilotReportMaker or None
    """
    task = active_learning_job.args.task
    if task != PILOT_TASK:
        # pdf report for screen task is disabled in 2021-1, so SCREEN_TASK
        # does not map to ALScreenReportMaker here; the evaluate task has no
        # report at all.
        return None
    return ALPilotReportMaker(active_learning_job)
def get_time_cost(nodes, node_name):
    """
    Format the time cost of a node, or report that it is not available.

    :param nodes: dict that maps node name to node object.
    :type nodes: dict{str: ActiveLearningNode}

    :param node_name: name of the active learning node of interest.
    :type node_name: str

    :return: time cost in h/m/s format, or 'Unavailable' when the lookup
        raises KeyError.
    :rtype: str
    """
    try:
        hours, minutes, seconds = nodes[node_name].time_cost
        formatted = "{:3d}h {:02d}m {:02d}s".format(hours, minutes, seconds)
    except KeyError:
        formatted = "Unavailable"
    return formatted
def get_score_pred_as_array(title_to_score,
                            pred_score_file,
                            discard_cutoff,
                            ascending=True):
    """
    Return the score, predicted score, prediction uncertainty of the ligands
    as the N X 3 numpy array.

    The first line of the csv file is treated as a header and skipped. For
    every other row the title is read from the second column, the predicted
    score from the second to last column and the uncertainty from the last
    column. Ligands without a known score are given `discard_cutoff` as their
    score, and only ligands whose score is strictly better than the cutoff
    are kept.

    :param title_to_score: dict that maps ligand title to score.
    :type title_to_score: dict(str:float)

    :param pred_score_file: path of the ligand ml prediction .csv file.
    :type pred_score_file: str

    :param discard_cutoff: score cutoff for excluding the ligands in ML
        training set.
    :type discard_cutoff: float

    :param ascending: lower value means better ligand if ascending is True
    :type ascending: bool

    :return: numpy array of (num_of_ligands X (score, pred, uncertain)); the
        shape is (0, 3) when no ligand passes the cutoff.
    :rtype: N X 3 numpy array
    """
    # Negating descending scores lets a single "<" comparison mean "better".
    # The cutoff does not change per row, so convert it once.
    discard_cutoff_decorated = discard_cutoff if ascending else \
        -discard_cutoff

    score_pred_uncertainty = []
    with open(pred_score_file, "r", newline='') as f_in:
        score_reader = csv.reader(f_in)
        # Skip the header; the default keeps an empty file from raising.
        next(score_reader, None)
        for row in score_reader:
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue
            # Read the title before the conversions so that the warning
            # below always names the row that actually failed.
            title = row[1]
            try:
                ml_score, uncertainty = float(row[-2]), float(row[-1])
            except ValueError:
                logger.warning(f"Warning: Cannot get the prediction score or "
                               f"uncertainty for ligand {title}")
                continue
            true_score = title_to_score.get(title, discard_cutoff)
            true_score_decorated = true_score if ascending else -true_score
            if true_score_decorated < discard_cutoff_decorated:
                score_pred_uncertainty.append(
                    [true_score, ml_score, uncertainty])

    if not score_pred_uncertainty:
        # Keep the documented N X 3 shape so column slicing still works.
        return np.empty((0, 3))
    return np.asarray(score_pred_uncertainty)
def calculate_recovery_ratio(label_pred, top_ratio):
    """
    Compute how quickly the best ligands (by label) are recovered when the
    ligands are screened in the order of their ligand_ml prediction.

    Both label and prediction are ranked in ascending order, i.e. a more
    negative value means a better ligand. The input array is not modified.

    :param label_pred: numpy array contains the (label, prediction).
    :type label_pred: (number of ligands X 2) numpy array.

    :param top_ratio: top ratio of the ligands by label.
    :type top_ratio: float

    :return: (screen ratio, recovery ratio of top ligands defined by
        top_ratio) of all the ligands.
    :rtype: (number of ligands X 2) numpy array
    """
    total = len(label_pred)
    best_count = math.ceil(total * top_ratio)

    # Fancy indexing returns a copy, so the label column can be overwritten
    # with a 0/1 "is among the best" flag without touching the caller's data.
    ranked_by_label = label_pred[label_pred[:, 0].argsort()]
    ranked_by_label[:best_count, 0] = 1
    ranked_by_label[best_count:, 0] = 0

    # Walk the ligands from best to worst prediction and count the hits.
    ranked_by_pred = ranked_by_label[ranked_by_label[:, 1].argsort()]
    screened_fraction = np.arange(1, total + 1) / total
    recovered_fraction = np.cumsum(ranked_by_pred[:, 0]) / best_count

    return np.column_stack([screened_fraction, recovered_fraction])
def plot_regression(y_true, y_pred, fname):
    """
    Generate regression plot. This function is sightly modified from
    ligand_ml/plotting.py to change the labels of axis.

    The figure is closed once it has been written, so repeated calls do not
    accumulate open matplotlib figures.

    :param y_true: test set label.
    :type y_true: 1d numpy array

    :param y_pred: ligand_ml prediction and uncertainty
    :type y_pred: 2d numpy array

    :param fname: filename to save the image
    :type fname: str
    """
    # A single figure is enough. The saved image keeps the default figure
    # size it always had: the former extra ``plt.figure(figsize=...)`` call
    # only created an unused, never-closed figure.
    fig, ax = plt.subplots()
    try:
        means = y_pred[:, 0]
        stds = y_pred[:, 1]
        ax.set_xlabel('True Score', fontsize=14)
        ax.set_ylabel('ML Score', fontsize=14)
        # Error bars scale the uncertainty by the 0.95 quantile of a Student
        # t distribution with 13 degrees of freedom.
        # NOTE(review): the value 13 is taken over unchanged; its origin is
        # presumably ligand_ml/plotting.py - confirm.
        error_scale = scipy.stats.t.ppf(0.95, 13)
        ax.errorbar(y_true, means, yerr=error_scale * stds, fmt='o')
        # Use one common range for both axes so that the diagonal drawn
        # below is the line of perfect prediction.
        lims = [
            np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
            np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
        ]
        ax.plot(lims, lims)
        ax.set_xlim(lims)
        ax.set_ylim(lims)
        ax.grid(**_GRID_LINE_PROPERTIES)
        fig.savefig(fname, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
def plot_recovery(recovery_results, fname):
    """
    Generate and save recovery plot image.

    One curve is drawn per entry of `recovery_results`, labelled with its top
    ratio. The figure is closed once it has been written, so repeated calls
    do not accumulate open matplotlib figures.

    :param recovery_results: dict that maps top ratio to the recovery
        ratio numpy array.
    :type recovery_results: dict{float:np.array}

    :param fname: path of the saved image.
    :type fname: str
    """
    tick_positions = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    tick_labels = ["0%", "20%", "40%", "60%", "80%", "100%"]

    # A single figure is enough. The saved image keeps the default figure
    # size it always had: the former extra ``plt.figure(figsize=...)`` call
    # only created an unused, never-closed figure.
    fig, ax = plt.subplots()
    try:
        ax.set_xlabel("ML Evaluate Percentage", fontsize=14)
        ax.set_ylabel("Recovery Percentage", fontsize=14)
        legends = []
        for top_ratio, recovery_result in recovery_results.items():
            # Column 0 is the screened fraction, column 1 the recovery.
            ax.plot(recovery_result[:, 0], recovery_result[:, 1])
            legends.append(f"Best {100*top_ratio:.1f}%")
        ax.legend(legends)
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels)
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(tick_labels)
        ax.grid(**_GRID_LINE_PROPERTIES)
        fig.savefig(fname, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
def make_regress_recovery_plots(y_true, y_pred_uncertain, top_ratio_samples,
                                regress_text, recovery_text):
    """
    Create the regression plot and the recovery plot and arrange both, with
    their titles and captions, as the rows of a two-column table.

    The plots are written to ``screen_regress.png`` and
    ``screen_recovery.png`` in the working directory.

    :param y_true: true scores of the ligands.
    :type y_true: 1d numpy array

    :param y_pred_uncertain: predicted score (column 0) and prediction
        uncertainty (column 1) of the ligands.
    :type y_pred_uncertain: 2d numpy array

    :param top_ratio_samples: top ratios for which the recovery is computed.
    :type top_ratio_samples: list(float)

    :param regress_text: caption shown below the regression plot.
    :type regress_text: str

    :param recovery_text: caption shown below the recovery plot.
    :type recovery_text: str

    :return: table rows (titles, images, captions) and the dict that maps
        each sampled top ratio to its recovery ratio array.
    :rtype: list(list), dict{float:np.array}
    """
    # Left column: true score against ML prediction.
    regress_file = "screen_regress.png"
    plot_regression(y_true, y_pred_uncertain, regress_file)
    regress_img = get_image(regress_file, 3.4 * inch)

    # Right column: recovery curves, one per sampled top ratio.
    label_and_pred = np.stack(
        [y_true.reshape(-1), y_pred_uncertain[:, 0].reshape(-1)], axis=1)
    recovery_results = {}
    for ratio in top_ratio_samples:
        recovery_results[ratio] = calculate_recovery_ratio(
            label_and_pred, ratio)
    recovery_file = "screen_recovery.png"
    plot_recovery(recovery_results, recovery_file)
    recovery_img = get_image(recovery_file, 3.5 * inch)

    caption_style = ParagraphStyle('small', fontSize=8, leading=8)
    rows = [
        ['True Score vs ML Prediction', "Recovery of Best Ligands"],
        [regress_img, recovery_img],
        [
            Paragraph(regress_text, caption_style),
            Paragraph(recovery_text, caption_style)
        ],
    ]
    return rows, recovery_results
def make_recovery_table(recovery_results, screen_ratio_samples):
    """
    Generate a list of list that contains the recovery ratio for certain top
    ratio and screen ratio.

    For every (top ratio, screen ratio) pair the recovery is read from the
    row of the recovery array whose screen ratio is closest to the requested
    one. The caption describes the first top ratio at the first screen ratio.

    :param recovery_results: dict that maps top ratio to the recovery
        ratio numpy array.
    :type recovery_results: dict{float:np.array}

    :param screen_ratio_samples: list of screen ratios
    :type screen_ratio_samples: list(float)

    :return: table as a list of list, table caption, largest enrichment in
        the table.
    :rtype: list(list(str)), str, float

    :raise IndexError: if `recovery_results` or `screen_ratio_samples` is
        empty, since the caption needs a first entry of each.
    """

    def get_closest_number(result, sample_ratio):
        # Recovery (column 1) of the row whose screen ratio (column 0) is
        # nearest to the requested ratio.
        return result[np.abs(result[:, 0] - sample_ratio).argmin()][1]

    recovery_ratio = {
        (top_ratio, sample_ratio): get_closest_number(result, sample_ratio)
        for top_ratio, result in recovery_results.items()
        for sample_ratio in screen_ratio_samples
    }

    table = [["Recovery Percentage of"] +
             [f"Screen {x * 100:.1f}%" for x in screen_ratio_samples]]
    best_enrichment = 0
    for top_ratio in recovery_results:
        recovered = [
            recovery_ratio[(top_ratio, x)] for x in screen_ratio_samples
        ]
        table.append([f"Best {top_ratio * 100:.2f}%"] +
                     [f"{value * 100:.1f}%" for value in recovered])
        # Enrichment is the recovered fraction per screened fraction.
        row_best = max(
            value / x for value, x in zip(recovered, screen_ratio_samples))
        best_enrichment = max(best_enrichment, row_best)

    example_screen_ratio = screen_ratio_samples[0]
    example_top_ratio = list(recovery_results.keys())[0]
    example_recovery_ratio = recovery_ratio[(example_top_ratio,
                                             example_screen_ratio)]
    # Every fragment ends with a space so that the words do not run together
    # (the caption used to read "scoredligands").
    table_caption = f'The table shows that AL is able to recover ' \
                    f'{example_recovery_ratio * 100:.1f}% of ' \
                    f'the best {example_top_ratio * 100:.2f}% of scored ' \
                    f'ligands in the top {example_screen_ratio*100:.1f}% ' \
                    f'ligands predicted by the ML model.'
    return table, table_caption, best_enrichment
def get_conclusion_string(best_enrichment,
                          job_type,
                          high_enrich=10,
                          low_enrich=2):
    """
    Return the conclusion string based on the job type and the highest
    enrichment we have in the recovery ratio table.

    :param best_enrichment: largest enrichment found in the recovery table.
    :type best_enrichment: float

    :param job_type: task of the job, either PILOT_TASK or SCREEN_TASK.
    :type job_type: str

    :param high_enrich: enrichment at or above which it is reported as high.
    :type high_enrich: float

    :param low_enrich: enrichment at or above which (and below
        `high_enrich`) it is reported as moderate; anything lower is
        reported as insufficient.
    :type low_enrich: float

    :return: conclusion text for the report.
    :rtype: str

    :raise ValueError: if `job_type` is neither PILOT_TASK nor SCREEN_TASK.
        Such a job type used to fail with an UnboundLocalError instead.
    """
    is_high = best_enrichment >= high_enrich
    is_moderate = low_enrich <= best_enrichment < high_enrich

    if job_type == PILOT_TASK:
        if is_high:
            return "Pilot mode shows high enrichment over the best " \
                   "ligands. You can safely proceed with full AL job."
        if is_moderate:
            return "Pilot mode shows moderate enrichment over the " \
                   "best ligands. " \
                   "AL will provide some signal for the system " \
                   "but may not have the ideal performance."
        return "Pilot mode does not show enough enrichment for the " \
               "system. Please check your inputs."

    if job_type == SCREEN_TASK:
        if is_high:
            return "You can expect high enrichment of best ligands in the " \
                   "rescored ligands."
        if is_moderate:
            return "You can expect moderate enrichment of best " \
                   "ligands in the rescored ligands."
        return "Test data shows that the enrichment of best " \
               "ligands in the rescored ligands maybe low. Please " \
               "check your inputs."

    raise ValueError(
        f"No conclusion is defined for job type {job_type!r}; expected "
        f"{PILOT_TASK!r} or {SCREEN_TASK!r}.")
class ALReportMaker:
    """
    Common base of the AL report makers.

    It stores the job and the list of report elements and prepares the pdf
    document. Subclasses are expected to set `report_filename`, which
    `initReport` reads.
    """

    def __init__(self, active_learning_job):
        """
        Set up an empty report for an active learning job

        :param active_learning_job: job the report is written for.
        :type active_learning_job: ActiveLearningJob
        """
        self.title = "Active Learning Report by Schrodinger Inc"
        # The document is only created by initReport().
        self.doc = None
        self.Elements = []
        self.job = active_learning_job

    def initReport(self, header):
        """
        Create the pdf document, set its metadata and margins, and start the
        element list with the logo and the header

        :param header: header text placed at the top of the report.
        :type header: str
        """
        self.doc = document = rh.platypus.SimpleDocTemplate(
            self.report_filename)
        document.title = self.title
        document.author = 'Schrodinger Inc'
        document.leftMargin, document.rightMargin = 30., 20.
        document.topMargin, document.bottomMargin = 10., 10.

        elements = self.Elements
        rh.report_add_logo(elements)
        rh.header(elements, header)
class ALPilotReportMaker(ALReportMaker):
    """
    Report maker that writes the pdf analysis of a pilot task to
    ``<jobname>_pilot_report.pdf``.
    """

    def __init__(self, active_learning_job):
        """
        Initialize the report maker and derive the report file name from the
        job name

        :param active_learning_job: pilot job the report is written for.
        :type active_learning_job: ActiveLearningJob
        """
        super().__init__(active_learning_job)
        self.report_filename = active_learning_job.jobname + \
            "_pilot_report.pdf"

    def report(self):
        """
        Function for building the report
        """
        header = "Active Learning Pilot Task Analysis"
        self.initReport(header)
        self.addRunDetail()
        self.addRecoveryResults()
        self.doc.build(self.Elements, canvasmaker=rh.NumberedCanvas)
        # Register the pdf as an output file of the job.
        al_utils.add_output_file(self.report_filename)

    def addRunDetail(self):
        """
        Add job specifications and running time cost information to the
        report
        """
        Ele = self.Elements
        rh.add_spacer(Ele)
        number_ligands = self.job.args.pilot_size
        training_size = self.job.args.train_size
        # The grid line is only printed when the job arguments have a "grid"
        # entry at all; an entry that is None is shown as "Unavailable".
        if "grid" in self.job.args:
            if self.job.args.grid is not None:
                grid_file = self.job.args.grid
            else:
                grid_file = "Unavailable"
        else:
            grid_file = None
        host, ncpu = al_utils.get_host_ncpu()

        rh.pargph(Ele, '<u>Running Information </u>')
        rh.add_spacer(Ele)
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Job name</font>: "
            f"{self.job.jobname}")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Job type</font>: "
            f"Pilot task")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Pilot size</font>: "
            f"{number_ligands}")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Training size</font>: "
            f"{training_size}")
        if grid_file is not None:
            rh.pargph(
                Ele,
                f"<font color={RUN_INFO_TEXT_COLOR}>Grid file</font>: {grid_file}"
            )
        gpu_info_str = ''
        if self.job.args.train_host is not None:
            ngpu = self.job.args.num_train_core
            gpu_info_str += f", {ngpu} training processor(s) on" \
                            f" {self.job.args.train_host}"
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Running nodes</font>: "
            f"{ncpu} CPU(s) on {host}" + gpu_info_str)
        rh.add_spacer(Ele)

        nodes_dict = self.job.finished_nodes
        table = [['', 'Pilot Ligands Scoring', 'Training', 'Evaluating']]
        pilot_score_time = get_time_cost(nodes_dict, 'PilotScoreNode_iter_0')
        training_time = get_time_cost(nodes_dict, 'LigandMLTrainNode_iter_1')
        evaluating_time = get_time_cost(nodes_dict, 'LigandMLEvalNode_iter_1')
        table += [[
            'Wall Time Cost', pilot_score_time, training_time, evaluating_time
        ]]
        # One width per table column (the table has four columns; a fifth,
        # surplus width used to be passed as well).
        width = [w * inch for w in [0.8, 1.5, 1.0, 1.0]]
        ncols, nrows = len(table[0]), len(table)
        style = [
            ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
            ('ALIGN', (0, 1), (0, nrows - 1), 'RIGHT'),
            ('TEXTCOLOR', (0, 0), (ncols - 1, 0), rh.gray),
            ('TEXTCOLOR', (0, 0), (0, nrows - 1), rh.gray)
        ]  # yapf: disable
        table = Table(table, width, style=style, hAlign='CENTER')
        Ele.append(table)

    def addRecoveryResults(self):
        """
        Add the regression plot, recovery plot, recovery table and
        conclusion to the report.
        """
        Ele = self.Elements
        rh.add_spacer(Ele)
        rh.pargph(Ele, '<u>Pilot Results</u>')
        rh.add_spacer(Ele)

        title_to_score = self.job.known_title_to_score
        pred_score_file = self.job.finished_nodes[
            'LigandMLEvalNode_iter_1'].input_for_next_node['ligands_csv']
        score_pred_uncertainty = get_score_pred_as_array(
            title_to_score, pred_score_file, self.job.discard_cutoff,
            self.job.ascending)
        # Column 0 is the true score; columns 1 and 2 are the prediction and
        # its uncertainty.
        # NOTE(review): the recovery calculation ranks scores in ascending
        # order regardless of self.job.ascending - confirm that this is
        # intended for descending scores.
        y_true, y_pred_uncertainty = score_pred_uncertainty[:, 0], \
                                     score_pred_uncertainty[:, 1:]
        y_pred = y_pred_uncertainty[:, 0]
        # Take the square root explicitly: the ``squared=False`` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        rmse = math.sqrt(metrics.mean_squared_error(y_true, y_pred))
        r2 = metrics.r2_score(y_true, y_pred)
        top_ratio_samples = [0.001, 0.002, 0.005]
        regress_text = f"This plot shows the true scores vs machine " \
                       f"learning model predictions of all the pilot " \
                       f"ligands. The R squared (R2) is {r2:.2f}. The " \
                       f"root-mean-square error (RMSE) is {rmse:.2f}."
        recovery_text = "The above plot shows how effectively the model can " \
                        "recover the top ligands by scoring early in " \
                        "the list of ML predictions. The production run " \
                        "on the full library should be expected to " \
                        "out-perform the pilot study."

        table, recovery_results = make_regress_recovery_plots(
            y_true, y_pred_uncertainty, top_ratio_samples, regress_text,
            recovery_text)
        style = [('TOPPADDING', (0, 1), (-1, -1), 1),
                 ('ALIGN', (0, 0), (-1, -1), 'CENTER')]
        rh.add_table(Ele, table, style, [4, 4])

        screen_ratio_samples = [0.01, 0.02, 0.05, 0.10, 0.20]
        table, table_caption, best_enrichment = \
            make_recovery_table(recovery_results, screen_ratio_samples)
        # One label column plus one column per screen ratio sample.
        width = [w * inch for w in [1.5, 1.0, 1.0, 1.0, 1.0, 1.0]]
        ncols, nrows = len(table[0]), len(table)
        style = [
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('TEXTCOLOR', (0, 0), (ncols - 1, 0), rh.gray),
            ('TEXTCOLOR', (0, 0), (0, nrows - 1), rh.gray)
        ]  # yapf: disable
        reportlab_table = Table(table, width, style=style, hAlign='CENTER')
        rh.add_spacer(Ele)
        rh.add_spacer(Ele)
        rh.pargph(
            Ele, "Recovery of the best scored ligands at different "
            "fractions of the top ligands according to the model.")
        rh.add_spacer(Ele)
        Ele.append(reportlab_table)
        rh.add_spacer(Ele)
        rh.pargph(Ele, table_caption)

        rh.add_spacer(Ele)
        rh.add_spacer(Ele)
        rh.pargph(Ele, '<u>Conclusion</u>')
        rh.add_spacer(Ele)
        conclusion = get_conclusion_string(best_enrichment,
                                           self.job.args.task)
        rh.pargph(Ele, conclusion)
class ALScreenReportMaker(ALReportMaker):
    """
    Report maker that writes the pdf analysis of a screen task to
    ``<jobname>_screen_report.pdf``.
    """

    def __init__(self, active_learning_job):
        """
        Initialize the report maker and derive the report file name from the
        job name

        :param active_learning_job: screen job the report is written for.
        :type active_learning_job: ActiveLearningJob
        """
        super().__init__(active_learning_job)
        self.report_filename = active_learning_job.jobname + \
            "_screen_report.pdf"

    def report(self):
        """
        Function for building the report
        """
        header = "Active Learning Screen Task Analysis"
        self.initReport(header)
        self.addRunDetail()
        self.addRecoveryResults()
        self.doc.build(self.Elements, canvasmaker=rh.NumberedCanvas)
        # Register the pdf as an output file of the job.
        al_utils.add_output_file(self.report_filename)

    def addRunDetail(self):
        """
        Add job specifications and running time cost information to the
        report
        """
        Ele = self.Elements
        rh.add_spacer(Ele)
        num_iter = self.job.args.num_iter
        grid_file = self.job.args.grid if self.job.args.grid is not None \
            else "Unavailable"
        host, ncpu = al_utils.get_host_ncpu()

        rh.pargph(Ele, '<u>Running Information </u>')
        rh.add_spacer(Ele)
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Job name</font>: "
            f"{self.job.jobname}")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Job type</font>: "
            f"Screen task")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Library size</font>: "
            f"{self.job.total_ligand_num}")
        rh.pargph(
            Ele,
            f"<font color={RUN_INFO_TEXT_COLOR}>Grid file</font>: {grid_file}")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Training size</font>: "
            f"{self.job.args.train_size}")
        rh.pargph(
            Ele,
            f"<font color={RUN_INFO_TEXT_COLOR}>Number of iteration</font>: "
            f"{num_iter}")
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Number of rescore "
            f"ligands </font>: {self.job.args.num_rescore_ligand}")
        gpu_info_str = ''
        if self.job.args.train_host is not None:
            ngpu = self.job.args.num_train_core
            gpu_info_str += f", {ngpu} GPU(s) on {self.job.args.train_host}"
        rh.pargph(
            Ele, f"<font color={RUN_INFO_TEXT_COLOR}>Running nodes "
            f"ligands </font>: {ncpu} CPU(s) on {host}" + gpu_info_str)
        rh.add_spacer(Ele)

        nodes_dict = self.job.finished_nodes
        header_name = [
            'Wall Time Cost', 'Preparing', 'Scoring', 'Training', 'Evaluating'
        ]
        # NOTE(review): self.al_node_supplier is not assigned anywhere in
        # this class or in ALReportMaker - confirm where it is expected to
        # come from.
        nodes_name_list = [
            x.__name__ for x in self.job.getNodeClasses(
                bool(self.job.args.score_file), self.al_node_supplier)
        ]
        if self.job.args.num_rescore_ligand:
            header_name += ['Rescoring']
        table = [header_name]
        # One table row per iteration, one cell per node class.
        for iter_now in range(1, num_iter + 1):
            row = [f"Iteration {iter_now}"]
            for node_name in nodes_name_list:
                time_cost = get_time_cost(nodes_dict, f"{node_name}_iter_"
                                          f"{iter_now}")
                row.append(time_cost)
            if self.job.args.num_rescore_ligand:
                # A rescoring time is only looked up for the last iteration.
                rescore_time = 'N/A'
                if iter_now == num_iter:
                    rescore_time = get_time_cost(
                        nodes_dict, f'RescoreNode_iter_{iter_now}')
                row.append(rescore_time)
            table.append(row)

        ncols, nrows = len(table[0]), len(table)
        width = [w * inch for w in [1.2] + [1.0] * (ncols - 1)]
        style = [
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('TEXTCOLOR', (0, 0), (ncols - 1, 0), rh.gray),
            ('TEXTCOLOR', (0, 0), (0, nrows - 1), rh.gray)
        ]  # yapf: disable
        table = Table(table, width, style=style, hAlign='CENTER')
        Ele.append(table)

    def addRecoveryResults(self):
        """
        Add the regression plot, recovery plot, recovery table and
        conclusion to the report.
        """
        Ele = self.Elements
        rh.add_spacer(Ele)
        rh.add_spacer(Ele)
        rh.pargph(Ele, '<u>Test Set Results</u>')
        rh.add_spacer(Ele)

        # The test set metrics come from the model of the last iteration.
        train_node = self.job.finished_nodes[f'LigandMLTrainNode_iter_' \
                                             f'{self.job.args.num_iter}']
        model_file = train_node.input_for_next_node['model_file']
        _, _, _, y_true, y_pred = get_ligand_ml_metric(model_file)
        # Take the square root explicitly: the ``squared=False`` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        rmse = math.sqrt(metrics.mean_squared_error(y_true, y_pred[:, 0]))
        r2 = metrics.r2_score(y_true, y_pred[:, 0])
        top_ratio_samples = [0.001, 0.002, 0.005]
        # The fragment with the ligand count ends with a space so that the
        # following sentence does not run into it.
        regress_text = f"This plot shows the true scores vs machine " \
                       f"learning model predictions of the test ligands. " \
                       f"The total number of test ligands is " \
                       f"{y_true.shape[0]}. " \
                       f"The R squared (R2) is {r2:.2f}. The " \
                       f"root-mean-square error (RMSE) is {rmse:.2f}."
        recovery_text = "This plot shows how well the model is able to " \
                        "recovery the best test ligands by screening small " \
                        "portion of all the test ligands. The recovery " \
                        "of the full data set is usually better than " \
                        "the recovery calculated using the test set."

        table, recovery_results = make_regress_recovery_plots(
            y_true, y_pred, top_ratio_samples, regress_text, recovery_text)
        style = [
            ('TOPPADDING', (0, 1), (-1, -1), 1),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER')
        ]  # yapf: disable
        rh.add_table(Ele, table, style, [4, 4])

        screen_ratio_samples = [0.01, 0.02, 0.05, 0.10, 0.20]
        table, table_caption, best_enrichment =\
            make_recovery_table(recovery_results, screen_ratio_samples)
        # One label column plus one column per screen ratio sample.
        width = [w * inch for w in [1.5, 1.0, 1.0, 1.0, 1.0, 1.0]]
        ncols, nrows = len(table[0]), len(table)
        style = [
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('TEXTCOLOR', (0, 0), (ncols - 1, 0), rh.gray),
            ('TEXTCOLOR', (0, 0), (0, nrows - 1), rh.gray)
        ]  # yapf: disable
        reportlab_table = Table(table, width, style=style, hAlign='CENTER')
        rh.add_spacer(Ele)
        rh.add_spacer(Ele)
        rh.pargph(
            Ele, "Recovery percentage of best test ligands at "
            "different screening percentage:")
        rh.add_spacer(Ele)
        Ele.append(reportlab_table)
        rh.add_spacer(Ele)
        rh.pargph(Ele, table_caption)

        rh.add_spacer(Ele)
        rh.add_spacer(Ele)
        rh.pargph(Ele, '<u>Conclusion</u>')
        rh.add_spacer(Ele)
        conclusion = get_conclusion_string(best_enrichment,
                                           self.job.args.task)
        rh.pargph(Ele, conclusion)