Source code for schrodinger.application.desmond.correlation_tau

"""
This program computes the average correlation coefficient and
the kendall tau rank coefficient between experiment and prediction samples.
Samples are randomly drawn from gaussian distribution centered on each
experimental data point with given error.

The algorithm used in this program is based on the work by
Scott P. Brown, Steven W. Muchmore, Philip J. Hajduk
Drug Discovery Today, Vol. 14, No. 7-8., pp. 420-427

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Byungchan Kim, Robert Abel, Lingle Wang, Teng Lin

import numpy
from scipy import stats
from dataclasses import dataclass


[docs]@dataclass class ConfidenceInterval: val: float lower_bound: float upper_bound: float def __str__(self): return f"{self.val:.2f}_{self.lower_bound:.2f}^{self.upper_bound:.2f}"
[docs]def predict_kendall_tau(experiment, experiment_sigma=0.3, prediction_sigma=0.3, num_sample=1000): """ Computes the average Kendall tau rank correlation coefficient between experiment and prediction samples. num_sample independent data for each experiment and prediction are sampled from gaussian distribution with experiment_sigma and prediction_sigma error. :param experiment: sequence of experiment data :param experiment_sigma: experimental error :param prediction_sigma: prediction error :param num_sample: number of samples :return: average_tau, sigma_tau """ experiment_data, prediction_data = _prepare_exp_pre_sample( experiment, experiment_sigma, prediction_sigma, num_sample) # kendall tau test kendall_tau = [ stats.kendalltau(exp, pre)[0] for exp, pre in zip(experiment_data, prediction_data) ] return numpy.average(kendall_tau), numpy.std(kendall_tau)
def _get_ci(data): avg = numpy.average(data) ci_25 = numpy.percentile(data, 2.5) ci_975 = numpy.percentile(data, 97.5) return ConfidenceInterval(avg, ci_25, ci_975)
[docs]def predict_expected_slope(experiment, experiment_sigma=0.3, prediction_sigma=0.3, num_sample=1000): # add random noise to experiment and prediction data experiment_data, prediction_data = _prepare_exp_pre_sample( experiment, experiment_sigma, prediction_sigma, num_sample) slopes = [ numpy.polyfit(e, p, 1)[0] for e, p in zip(experiment_data, prediction_data) ] return _get_ci(slopes)
[docs]def predict_correlation( experiment, experiment_sigma=0.3, prediction_sigma=0.3, num_sample=1000, return_ci=False, ): """ Computes the average correlation coefficient between experiment and prediction samples. num_sample independent data for each experiment and prediction are sampled from gaussian distribution with experiment_sigma and prediction_sigma error. :param experiment: sequence of experiment data :param experiment_sigma: experimental error :param prediction_sigma: prediction error :param num_sample: number of samples :param return_ci: If True, return the data in ConfidenceInterval format: <R>, <R^2>, <R^2_signed> If False, return: <R>, sigma_R, <R^2>, sigma_R^2, <R^2_signed>, sigma_R^2_signed """ experiment_data, prediction_data = _prepare_exp_pre_sample( experiment, experiment_sigma, prediction_sigma, num_sample) # compute correlation matrix among experiment and prediction samples correlation_matrix = numpy.corrcoef(experiment_data, prediction_data) # extract correlation matrix only between experiment and prediction samples exp_pre_correlation = correlation_matrix[:num_sample, num_sample:] exp_pre_correlation_square = numpy.square(exp_pre_correlation) exp_pre_abs_correlation = numpy.absolute(exp_pre_correlation) exp_pre_signed_correlation = numpy.multiply(exp_pre_correlation, exp_pre_abs_correlation) if return_ci: return (_get_ci(exp_pre_correlation), _get_ci(exp_pre_correlation_square), _get_ci(exp_pre_signed_correlation)) # yapf: disable else: return (numpy.average(exp_pre_correlation), numpy.std(exp_pre_correlation), numpy.average(exp_pre_correlation_square), numpy.std(exp_pre_correlation_square), numpy.average(exp_pre_signed_correlation), numpy.std(exp_pre_signed_correlation))
def _prepare_exp_pre_sample(experiment, experiment_sigma=0.3, prediction_sigma=0.3, num_sample=1000): """ :param experiment: sequence of experiment data :param experiment_sigma: experimental error :param prediction_sigma: prediction error :param num_sample: number of samples :return: experiment_sample, prediction_sample """ experiment = numpy.array(experiment) # random numbers from normal distribution numpy.random.seed(2017) experiment_random = numpy.random.normal(0.0, experiment_sigma, (num_sample, len(experiment))) prediction_random = numpy.random.normal(0.0, prediction_sigma, (num_sample, len(experiment))) # add random noise to experiment data experiment_sample = numpy.add(experiment, experiment_random) prediction_sample = numpy.add(experiment, prediction_random) return experiment_sample, prediction_sample
[docs]def compute_rmse(experiment, prediction): """ Computes root mean square error between experiment and prediction. Averages of experiment and prediction are aligned before RMSE computation. :param experiment: sequence of experiment data :param prediction: sequence of prediction data :return: root_mean_square_error between experiment and prediction """ experiment = numpy.array(experiment) prediction = numpy.array(prediction) experiment_average = numpy.average(experiment) prediction_average = numpy.average(prediction) offset = experiment_average - prediction_average return numpy.std(prediction - experiment + offset)