Source code for schrodinger.application.matsci.uq_utils

"""
Utilities for uncertainty quantification.

Copyright Schrodinger, LLC. All rights reserved.
"""

import math
import warnings
from collections import OrderedDict

import numpy
from scipy import stats
from scipy.optimize import OptimizeWarning
from scipy.optimize import curve_fit

N_BINS = 25

MODE_KEY = 'mode'
TG_LABEL = 'Tg'
TG_MODE = 'tg'
YIELD_STRAIN_LABEL = 'Yield Strain'
YIELD_STRAIN_MODE = 'yield_strain'
MODES = OrderedDict([(TG_LABEL, TG_MODE),
                     (YIELD_STRAIN_LABEL, YIELD_STRAIN_MODE)])

X_BIAS_F = lambda x: math.pow(x, 5. / 4.)


[docs]def check_property_sign(prop, points): """ Return True if the property has the correct sign. :type prop: float :param prop: the property :type points: list :param points: (x, y) pair tuples of data :rtype: bool :return: True if the property has the correct sign """ # all x values should have the same sign, always # positive for Tg and either always positive or always # negative for yield strain return numpy.sign(prop) == numpy.sign(points[0][0])
[docs]def get_nonzero_data(xs, ys): """ Return nonzero data. :type xs: list :param xs: the domain :type ys: list :param ys: the range :rtype: list, list :return: the domain and range """ assert len(xs) == len(ys) # for example, the sign of the strain x-data indicates # whether it is compression or expansion, all x will have # the same sign, for uncertainty quantification the domain # and range must be non-zero (see MATSCI-8423) data = [(x, y) for x, y in zip(xs, ys) if x and y] if data: xs, ys = zip(*data) else: xs, ys = [], [] if xs: positive = [x > 0 for x in xs] if any(positive) and not all(positive): xs, ys = [], [] return xs, ys
[docs]def get_bilinear_regions(original_data): """ Gets the initial early and late regions such that the R squared values are maximized. :type original_data: list :param original_data: contains pair tuples of data points :rtype: tuple, tuple :return: (min, max) tuples for early and late regions """ max_r2 = 0 max_r2_index = -1 original_data.sort() for i in range(len(original_data)): if i > 2: _, _, early_r, _, _ = stats.linregress( [point[0] for point in original_data[:i]], [point[1] for point in original_data[:i]]) else: early_r = 0 if i < len(original_data) - 2: _, _, late_r, _, _ = stats.linregress( [point[0] for point in original_data[i:]], [point[1] for point in original_data[i:]]) else: late_r = 0 early_r2 = early_r * early_r late_r2 = late_r * late_r if abs(early_r2) + abs(late_r2) > max_r2: max_r2 = abs(early_r2) + abs(late_r2) max_r2_index = i if original_data: pad = (original_data[max_r2_index][0] - original_data[max_r2_index - 1][0]) * 0.25 early_min = original_data[0][0] - pad early_max = original_data[max_r2_index - 1][0] + pad early_region = (early_min, early_max) late_min = original_data[max_r2_index][0] - pad late_max = original_data[-1][0] + pad late_region = (late_min, late_max) else: early_region = (0, 0) late_region = (0, 0) return early_region, late_region
[docs]def get_points_in_region(data, region): """ Determine which data points fall into a region given its lower and upper bound. :param data: contains pair tuples of data points :type data: list :param region: a tuple containing the lower and upper boundaries points of the region :type region: tuple :return: all points within the region :rtype: list """ points_in_region = [] for point in data: if point[0] > region[0] and point[0] < region[1]: points_in_region.append(point) return points_in_region
[docs]def get_bilinear_regression(data): """ Return the regression information. :type data: list :param data: contains pair tuples of data points :rtype: tuple :return: (slope, intercept, R^2) """ if len(data) > 1: if len(data) == 2: slope = (data[1][1] - data[0][1]) / \ (data[1][0] - data[0][0]) intercept = data[0][1] - (slope * data[0][0]) r = 0 else: slope, intercept, r, _, _ = stats.linregress( [point[0] for point in data], [point[1] for point in data]) return (slope, intercept, r * r)
[docs]def get_bilinear_prop(early_regression, late_regression): """ Returns the bilinear property if both regressions are available. :type early_regression: tuple :param early_regression: (slope, intercept, R^2) for early part :type late_regression: tuple :param late_regression: (slope, intercept, R^2) for late part :return: the property :rtype: float """ if early_regression and late_regression: prop = (early_regression[1] - late_regression[1]) / \ (late_regression[0] - early_regression[0]) return prop
[docs]def get_bilinear_fit_property(x, x_intersection, early_regression, late_regression, invert_y=True): """ Return the fit property given the x-value. :type x: float :param x: the x-value :type x_intersection: float :param x_intersection: the x intersection point :type early_regression: tuple :param early_regression: (slope, intercept, R^2) for early part :type late_regression: tuple :param late_regression: (slope, intercept, R^2) for late part :type invert_y: bool :param invert_y: whether to invert the y-values :rtype: float or None :return: the fit property value or None if there isn't one """ if x_intersection is None: return if x <= x_intersection: slope = early_regression[0] intercept = early_regression[1] else: slope = late_regression[0] intercept = late_regression[1] y_value = slope * x + intercept if invert_y: return 1. / y_value else: return y_value
[docs]def get_bilinear_prop_from_points(points, early_region=None, late_region=None): """ Return a bilinear property using the given points. :type points: list :param points: list of (x,y) tuples :type early_region: tuple or None :param early_region: (min, max) tuple for early region, if None then automatically determined :type late_region: tuple or None :param late_region: (min, max) tuple for late region, if None then automatically determined :rtype: float or None :return: the property or None if there isn't one """ assert not bool(early_region) ^ bool(late_region) if not early_region and not late_region: early_region, late_region = get_bilinear_regions(points) early_points = get_points_in_region(points, early_region) late_points = get_points_in_region(points, late_region) early_regression = get_bilinear_regression(early_points) late_regression = get_bilinear_regression(late_points) prop = get_bilinear_prop(early_regression, late_regression) if prop and check_property_sign(prop, points): return prop
[docs]def get_min_max_avg(data): """ Return (min, max, avg) of the given data. :type data: list :param data: the data :rtype: float, float, float :return: the min, max, avg """ return min(data), max(data), sum(data) / len(data)
[docs]def get_hyperbola_regression(data, avg_x): """ Return the regression information. :type data: list :param data: contains pair tuples of data points :type avg_x: float :param avg_x: the average x-value :rtype: tuple :return: (t0, a, b, c, p0) parameters """ if not data or avg_x is None: return avg_x = abs(avg_x) x_data, y_data = zip(*data) x_data_rescaled = [x / avg_x for x in x_data] with warnings.catch_warnings(): warnings.simplefilter('error', OptimizeWarning) try: popt, pcov = curve_fit(calc_property, x_data_rescaled, y_data) except (RuntimeError, TypeError, OptimizeWarning): return t0, a, b, c, p0 = popt t0 *= avg_x a /= avg_x b /= avg_x c += 2 * math.log(avg_x, math.e) return (t0, a, b, c, p0)
[docs]def calc_property(x_scaled, t0, a, b, c, p0): """ Calculate the hyperbolic property. :param x_scaled: the scaled x-value :type x_scaled: float :param t0: the scaled T_0 factor in polynomial :type t0: float :param a: a factor in polynomial :type a: float :param b: b factor in polynomial :type b: float :param c: c factor in polynomial :type c: float :param p0: p_0 factor in polynomial :type p0: float """ # the functional form of this hyperbola was original set according # to Patrone et al., Polymer 87 246 (2016) for Tg and has 5 free # parameters, for now also use this form for Yield Strain (because # it is more general) even though in Patrone et al., Polymer 116 # 295 (2017) a form featuring 4 free parameters is used delta_t = x_scaled - t0 c_term = math.e**c + 0.25 * delta_t**2 b_term = b * (0.5 * delta_t + c_term**0.5) a_term = a * delta_t return p0 + a_term + b_term
[docs]def get_hyperbola_fit_property(x, regression): """ Return the fit property given the x-value. :type x: float :param x: the x-value :type regression: tuple :param regression: (t0, a_coeff, b_coeff, c_coeff, p0) :rtype: float or None :return: the fit property value or None if there isn't one """ if regression is None: return return calc_property(x, *regression)
[docs]def get_hyperbola_prop_from_points(points): """ Return a hyperbola property using the given points. :type points: list :param points: list of (x,y) tuples :rtype: float or None :return: the property or None if there isn't one """ x_vals = [point[0] for point in points] _, _, avg_x = get_min_max_avg(x_vals) regression = get_hyperbola_regression(points, avg_x) if regression and check_property_sign(regression[0], points): return regression[0]
[docs]class BootstrapNoiseHistogramData(object): """ Manage data of a histogram of a quantity derived from a parametric bootstrap analysis of uncorrelated Gaussian white noise. """ SAMPLE_SIZE = 1000
[docs] def __init__(self): """ Create an instance. """ self.original_data = [] self.fit_function = None self._residuals = [] self.x_bias_function = lambda x: x self._cov_matrix = None self.seed = None self.sample_size = self.SAMPLE_SIZE self.target_function = None
[docs] def setOriginalData(self, points): """ Set the original data used to derive the histogram. :type points: list :param points: list of (x,y) tuples """ self.original_data = list(points)
[docs] def setFitFunction(self, fit_function): """ Set the fit function, i.e. the function that was fit to the original data. :type fit_function: function :param fit_function: the fit function """ self.fit_function = fit_function
def _setResiduals(self): """ Set the residuals. """ self._residuals = [ y - self.fit_function(x) for x, y in self.original_data ]
[docs] def setXBiasFunction(self, x_bias_function=None): """ Set a biasing function for the original x data. :type x_bias_function: function or None :param x_bias_function: a biasing function for the original x data or None if there isn't one """ if not x_bias_function: x_bias_function = lambda x: x self.x_bias_function = x_bias_function
def _setCovMatrix(self): """ Set the covariance matrix. """ # since the residuals are uncorrelated the covariance # matrix is diagonal all_x = list(zip(*self.original_data))[0] variance = 0.0 for an_x, residual in zip(all_x, self._residuals): variance += (residual / self.x_bias_function(an_x))**2 variance /= len(all_x) diagonal = [variance * self.x_bias_function(i)**2 for i in all_x] self._cov_matrix = numpy.diag(diagonal)
[docs] def seedRandom(self, seed=None): """ Seed the random number generator. :type seed: int or None :param seed: the seed """ # if None then it will try to read data from /dev/urandom (or # the Windows analogue) otherwise from the clock self.seed = seed numpy.random.seed(seed=self.seed)
[docs] def setSampleSize(self, size=None): """ Set the sample size. :type size: int or None :param size: the sample size, if None then the default of SAMPLE_SIZE is used """ if not size: self.sample_size = self.SAMPLE_SIZE else: self.sample_size = size
def _getNoiseVectors(self): """ Return white noise vectors sampled from a Gaussian multi-variate probability density function. :rtype: numpy.array :return: white noise vectors with a shape of (sample_size, len(original_data)) """ # the Gaussian white noise has zero mean, given that the # covariance matrix is diagonal the probability density # function is a product of functions mean = numpy.zeros(len(self.original_data)) return numpy.random.multivariate_normal(mean, self._cov_matrix, self.sample_size) def _getNoisyFitData(self): """ Return samples of the fit data which include noise. :rtype: list :return: list containing lists of (x,y) tuples """ # the manner in which noise is applied was original set according # to Patrone et al., Polymer 87 246 (2016) for Tg, do the same for # Yield Strain for now as it appears to work well, even though in # Patrone et al., Polymer 116 295 (2017) the noise is applied slightly # differently all_data = [] all_noises = self._getNoiseVectors() for noises in all_noises: data = [] pairs = list(zip(self.original_data, noises)) for (x, y), noise in pairs: data.append((x, self.fit_function(x) + noise)) all_data.append(data) return all_data
[docs] def setTargetFunction(self, target_function): """ Set the target function, i.e. the function used to obtain the target quantity from points. :type target_function: function :param target_function: the target function """ # if target function returns None when experiencing # an issue in computing the target value this # will trigger a resampling, see ._cleanTargetValues # method self.target_function = target_function
def _cleanTargetValues(self, targets): """ Clean the target values. :type targets: list :param targets: the target values :rtype: list or None :return: cleaned target values or None if there isn't one """ # in case the target_function can return None # replace these missing values by randomly sampling # the values that are present if such values are # present or returning early if not cleaned = list(targets) n_bad = cleaned.count(None) if n_bad: cleaned = [x for x in cleaned if x is not None] if not cleaned: return replace = list( numpy.random.choice(cleaned, size=n_bad, replace=True)) cleaned.extend(replace) return cleaned def _getSampledTargets(self): """ Return sampled target quantity values. :rtype: list or None :return: list of sampled target values or None if there isn't one """ data = self._getNoisyFitData() targets = [self.target_function(points) for points in data] return self._cleanTargetValues(targets)
[docs] def getHistogramData(self): """ Return the histogram data which is just a list of sampled target values. :raise RuntimeError: if required variables are not defined :rtype: list or None :return: list of sampled target values or None if there isn't one """ if not self.original_data: msg = ('No data specified.') elif not self.fit_function: msg = ('No fit function specified.') elif not self.target_function: msg = ('No target function specified.') else: msg = None if msg: raise RuntimeError(msg) self._setResiduals() self._setCovMatrix() return self._getSampledTargets()
[docs] @staticmethod def getSampleHistogram(points, fit_function, target_function, x_bias_function=None, seed=None, size=None): """ Return a sample of histogram data using the given points, fit function, and target function. :type points: list :param points: list of (x,y) tuples :type fit_function: function :param fit_function: the fit function :type target_function: function :param target_function: the target function :type x_bias_function: function or None :param x_bias_function: a biasing function for the original x data or None if there isn't one :type seed: int or None :param seed: the seed :type size: int or None :param size: the sample size, if None then the default of SAMPLE_SIZE is used :rtype: list or None :return: list of sampled target values or None if there isn't one """ data_obj = BootstrapNoiseHistogramData() data_obj.setOriginalData(points) data_obj.setFitFunction(fit_function) data_obj.setTargetFunction(target_function) data_obj.setXBiasFunction(x_bias_function=x_bias_function) data_obj.seedRandom(seed=seed) data_obj.setSampleSize(size=size) return data_obj.getHistogramData()
@staticmethod def _evalNormal(an_x, mu, sigma): """ Return an_x evaluated from a normal distribution with mean mu and standard deviation sigma. :type an_x: float or numpy.array :param an_x: the an_x value(s) :type mu: float :param mu: the mean of the normal distribution :type sigma: float :param sigma: the standard deviation of the normal distribution :rtype: float or numpy.array :return: an_x evaluation from the normal distribution """ return stats.norm.pdf(an_x, loc=mu, scale=sigma)
[docs] @staticmethod def getNormalParameters(data): """ Return the mean, standard deviation, covariance matrix, and figure of merit parameters from a normal fit of the given data. :type data: list :param data: contains (x, y) tuples :rtype: float, float, numpy.array, float or None, None, None, and None :return: the mean, standard deviation, covariance matrix, and figure of merit or None, None, None, and None if there aren't any """ none = (None,) * 4 if len(data) == 1: return none x, y = list(zip(*data)) avg_x = numpy.mean(x) std_x = numpy.std(x) p0 = [avg_x, std_x] # guess solution with warnings.catch_warnings(): warnings.simplefilter('error', OptimizeWarning) try: (mu, sigma), cov, info_dict, err_str, i_err = \ curve_fit(BootstrapNoiseHistogramData._evalNormal, x, y, p0=p0, full_output=True) except (RuntimeError, OptimizeWarning): return none residuals = info_dict['fvec'] fom = BootstrapNoiseHistogramData._getNormalStandardError(residuals) return mu, sigma, cov, fom
@staticmethod def _getNormalStandardError(residuals): """ Return the normal standard error of the given residuals from a fit. :type residuals: numpy.array :param residuals: the residuals of a fit, i.e. (y_data - y_fit) :rtype: float :return: the standard error """ # TODO - implement reduced chi-squared? return math.sqrt(numpy.dot(residuals, residuals) / len(residuals))
[docs]class BilinearHistogramData(BootstrapNoiseHistogramData): """ Manage a histogram using bilinear data. """
[docs] def __init__(self, invert_y=True): """ Create an instance. :type invert_y: bool :param invert_y: whether to invert the y-values """ super(BilinearHistogramData, self).__init__() self.invert_y = invert_y
def _getSampledTargets(self): """ Return sampled target quantity values. :rtype: list or None :return: list of sampled target values or None if there isn't one """ if self.invert_y: datas = self._getNoisyFitData() new = [] for data in datas: new.append([(x, 1. / y) for x, y in data]) targets = [self.target_function(points) for points in new] return self._cleanTargetValues(targets) else: return super()._getSampledTargets()
[docs] @staticmethod def getSampleHistogram(points, fit_function, target_function, x_bias_function=None, seed=None, size=None, invert_y=True): """ Return a sample of histogram data using the given points, fit function, and target function. :type points: list :param points: list of (x,y) tuples, if using invert_y then this data should be uninverted :type fit_function: function :param fit_function: the fit function, if using invert_y then expected to return uninverted data :type target_function: function :param target_function: the target function, if using invert_y then expects the input data to already be inverted :type x_bias_function: function or None :param x_bias_function: a biasing function for the original x data or None if there isn't one :type seed: int or None :param seed: the seed :type size: int or None :param size: the sample size, if None then the default of SAMPLE_SIZE is used :type invert_y: bool :param invert_y: whether to invert the y-values :rtype: list or None :return: list of sampled target values or None if there isn't one """ data_obj = BilinearHistogramData(invert_y=invert_y) data_obj.setOriginalData(points) data_obj.setFitFunction(fit_function) data_obj.setTargetFunction(target_function) data_obj.setXBiasFunction(x_bias_function=x_bias_function) data_obj.seedRandom(seed=seed) data_obj.setSampleSize(size=size) return data_obj.getHistogramData()
[docs]class HyperbolaHistogramData(BootstrapNoiseHistogramData): """ Manage a histogram using hyperbola data. """
[docs] @staticmethod def getSampleHistogram(points, fit_function, target_function, x_bias_function=None, seed=None, size=None): """ Return a sample of histogram data using the given points, fit function, and target function. :type points: list :param points: list of (x,y) tuples :type fit_function: function :param fit_function: the fit function :type target_function: function :param target_function: the target function :type x_bias_function: function or None :param x_bias_function: a biasing function for the original x data or None if there isn't one :type seed: int or None :param seed: the seed :type size: int or None :param size: the sample size, if None then the default of SAMPLE_SIZE is used :rtype: list or None :return: list of sampled target values or None if there isn't one """ data_obj = HyperbolaHistogramData() data_obj.setOriginalData(points) data_obj.setFitFunction(fit_function) data_obj.setTargetFunction(target_function) data_obj.setXBiasFunction(x_bias_function=x_bias_function) data_obj.seedRandom(seed=seed) data_obj.setSampleSize(size=size) return data_obj.getHistogramData()