Source code for schrodinger.application.desmond.solubility_utils

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import copy
import csv
from collections import defaultdict
from io import StringIO
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from schrodinger.application.desmond import cms
from schrodinger.application.desmond import constants
from schrodinger.application.desmond.measurement import Measurement
from schrodinger.application.desmond.correlation_tau import ConfidenceInterval
from schrodinger.structure import Structure
from schrodinger.structutils import analyze

# Default number of sublimation jobs
NUM_SUBLIMATION_JOBS = 5
NUM_POLYMER_SOLVATION_JOBS = 20

OBVIOUS_SASA_CUTOFF = 3.0

# Marker that indicates structure is soluble.
SOLUBLE_MARKER = "< 5kcal/mol"

ParsedReport = Dict[str, Dict]
COMPOUND_NAME = 'Compound Name'


[docs]def extract_solute_ct(cms_model: cms.Cms) -> Optional[Structure]: """ Returns a structure representing the solute component of the input cms file. The solute should come before the solvent if there are custom charges present. :param cms_model: Input cms object to extract the solute from. :return: Solute structure object or None if no solute is found """ solute_atom_list = cms_model.select_atom("solute") if solute_atom_list: # This preserves any custom charges if present ct = cms_model.fsys_ct.copy() del_atoms_idxs = [ i for i in range(1, ct.atom_total + 1) if i not in solute_atom_list ] ct.deleteAtoms(del_atoms_idxs) return ct
[docs]def get_transfer_free_energy(cms_model: cms.Cms) -> Optional[Measurement]: """ Returns the transfer free energy as a Measurement or None if the property could not be found. Depending on the leg that the cmd corresponds to, this could be sublimation, solvation, or hydration dGs. :param cms_model: Input cms object to extract the transfer free energy from. """ for ct in cms_model.comp_ct: transfer_free_ene = ct.property.get(constants.FEP_TRANSFER_FREE_ENERGY) if transfer_free_ene: return Measurement.from_string(transfer_free_ene)
[docs]def median_sublimation_free_energy( sublimation_dgs: List[Measurement]) -> Measurement: """ Returns the median sublimation free energy. For an odd number this is the median, for an even number this is the one closest to the average. :param sublimation_dgs: List of sublimation transfer free energy measurements. """ sublimation_vals = np.array([fene.val for fene in sublimation_dgs]) # Odd number of sublimation jobs, use the median if len(sublimation_dgs) % 2 == 1: median_idx = np.argmin( np.abs(sublimation_vals - np.median(sublimation_vals))) # Even number of sublimation jobs, use the one closer to the average else: median_idx = np.argmin( np.abs(sublimation_vals - np.average(sublimation_vals))) return sublimation_dgs[median_idx]
[docs]def calculate_solubility_free_energy( hydration_dg: Measurement, sublimation_dg: Measurement) -> Measurement: """ Returns the dissolution free energy. NOTE: - "Sublimation" leg actually simulates deposition (gas -> solid) - Solvation/hydration legs simulate solvation (solid -> solvated) - Dissolution ("solubility") free energy = Hydration - "Sublimation" :param hydration_dg: Hydration transfer free energy measurement. :param sublimation_dg: Sublimation transfer free energy measurement. """ return hydration_dg - sublimation_dg
[docs]def set_structure_soluble(ct: Structure) -> None: """ Update the fep solubility property to mark the structure as being soluble. The input ct is modified in place. :param ct: Structure to mark as soluble. """ ct.property[constants.FEP_SOLUBILITY] = SOLUBLE_MARKER
[docs]def is_structure_soluble(ct: Structure) -> bool: """ Return True if the structure has been marked as soluble, False otherwise. :param ct: Structure to check if marked as soluble. """ return ct.property.get(constants.FEP_SOLUBILITY) == SOLUBLE_MARKER
[docs]def is_soluble(exposures: List[float]) -> bool: """ If more than 10% of the solute molecules have a exposure greater than 90.0, flag the molecule as soluble (< 5 kcal/mol transfer energy). :param exposures: List of exposure for each molecule in the structure. :return: True if this molecule is considered soluble, False otherwise. """ exposed_mols = len([1 for score in exposures if score > 90.0]) percent_exposed = float(exposed_mols) / float(len(exposures)) * 100.0 return percent_exposed > 10.0
[docs]def get_molecule_exposures(baseline_ct: Structure, exposure_ct: Structure, delete_h_atoms=True) -> List[float]: """ Calculates the exposure of each molecule in a structure based on SASA and returns a list of exposure of each molecule by index of each molecule in the structure - 1 (since structure.molecule is a 1-indexed list). By default this functiion deletes all H atoms from the specified structure. To retain H atoms set delete_h_atoms to False. :param baseline_ct: Structure of an individual molecule used as the reference. :param exposure_ct: Structure used for the list of molecule exposures. :param delete_h_atoms: Set to True to delete all H atoms from the structure when calculating SASA. :return: List of exposure for each molecule in the structure. """ exposures = [] exposure_ct = exposure_ct.copy() baseline_ct = baseline_ct.copy() if delete_h_atoms: for ct in [exposure_ct, baseline_ct]: ct.deleteAtoms(analyze.evaluate_asl(ct, "atom.ele H")) for mol in exposure_ct.molecule: score = 0.0 mol_sasa = analyze.calculate_sasa(exposure_ct, mol.atom) rel_mol_sasa = mol_sasa / len(mol.atom) for atom in mol.atom: # Get SASA of original exposure_ct atom for comparison baseline_atom_idx = atom.index % baseline_ct.atom_total or baseline_ct.atom_total baseline_atom = baseline_ct.atom[baseline_atom_idx] baseline_sasa = analyze.calculate_sasa(baseline_ct, [baseline_atom]) if rel_mol_sasa < OBVIOUS_SASA_CUTOFF: atom_sasa = 0.0 else: atom_sasa = analyze.calculate_sasa(exposure_ct, [atom]) if atom_sasa > baseline_sasa: score += 1.0 score = 100.0 * (score / float(len(mol.atom))) exposures.append(score) return exposures
[docs]def extract_most_exposed_solute_molecules(solute_ct: Structure, exposures: List[float], extract_count=5) -> List[Structure]: """ Takes the solute component of an input system and returns the `extract_count` most exposed molecules ordered by exposure. If less than `extract_count` solute molecules are present, the returned list contains all solute molecules sorted by exposure. :param solute_ct: Input ct file containing the system to process. :param exposures: Input ct file containing the system to process. :param extract_count: The number of molecules to extract. :return: List of `extract_count` most exposed solute mols. """ ranked = sorted(exposures, reverse=True)[:extract_count] first_st = True solute_cts = [] for score in ranked: score_idx = exposures.index(score) # remove score from list in case there are duplicates exposures[score_idx] = None mol_idx = score_idx + 1 mol_st = copy.copy(solute_ct) for imol, mol in enumerate(mol_st.molecule, start=1): if imol == mol_idx: ffio_val = 1 else: ffio_val = 0 for atom in mol.atom: atom.property[constants.FEP_ABSOLUTE_ENERGY] = ffio_val atom.property[constants.FEP_ABSOLUTE_LIGAND] = ffio_val solute_cts.append(mol_st) return solute_cts
[docs]def get_header(max_num_sublimation: int): line = ( "Compound Name,Dissolution free energy,Dissolution free energy err," "Hydration free energy," + "Hydration free energy err,Median Sublimation free energy,Median Sublimation free energy err" ) if max_num_sublimation: sublime_line = ',' + ",".join( f"Sublimation free energy {i+1},Sublimation free energy {i+1} err" for i in range(max_num_sublimation)) line += sublime_line line += '\n' return line
[docs]def get_header_solvation(max_num_solvation: int): line = ( "Compound Name,Mean Solvation free energy,Mean Solvation free energy " "lower bound,Mean Solvation free energy upper bound") if max_num_solvation: line += ',' + ','.join( f"Solvation free energy {i+1}, Solvation free energy {i+1} err" for i in range(max_num_solvation)) line += '\n' return line
[docs]def get_report(title: str, hydration_dg: Optional[Measurement] = None, sublimation_dgs: Optional[List[Measurement]] = None, solvation_dgs: Optional[List[Measurement]] = None, is_soluble: bool = False) -> str: """ Returns the formatted results as csv file contents. :param title: Title of the ligand. :param hydration_dg: Hydration transfer free energy measurement. Must be specified if is_soluble is False. :param sublimation_dgs: List of sublimation transfer free energy measurements. Must be specified if is_soluble is False. :param solvation_dgs: List of solvation transfer free energy measurements. :param is_soluble: Set to True if this ligand was determined to be soluble in the MD stage. """ from itertools import chain if is_soluble: line = f"{title},{SOLUBLE_MARKER}\n" elif solvation_dgs: bs_solvation_dg = bootstrapped_solvation_free_energy(solvation_dgs) bs_str = f'{bs_solvation_dg.val},{bs_solvation_dg.lower_bound},{bs_solvation_dg.upper_bound},' results = [f'{dg.val},{dg.unc}' for dg in solvation_dgs] line = (f"{title}," + bs_str + ",".join(results) + "\n") else: if sublimation_dgs: median_sublimation_dg = median_sublimation_free_energy( sublimation_dgs) dissolution_dg = calculate_solubility_free_energy( hydration_dg, median_sublimation_dg) else: median_sublimation_dg = 'N/A+-N/A' dissolution_dg = 'N/A+-N/A' line = (f"{title}," + ",".join( f"{str(d).replace('+-', ',')}" for d in chain((dissolution_dg, hydration_dg, median_sublimation_dg), sublimation_dgs)) + "\n") return line
[docs]def parse_report(csv_contents: str) -> ParsedReport: """ Parse a report csv and return a `ParsedReport` mapping the ligand to the result. Each result is a dictionary where the keys are the columns from the csv. """ data = {} f = StringIO(csv_contents) for row in csv.DictReader(f): parsed_row = {} for k, v in row.items(): if k is not None: k = k.strip() if v is not None: v = v.strip() if k == COMPOUND_NAME: continue if v == 'N/A': v = None if v is not None and v != SOLUBLE_MARKER: v = float(v) parsed_row[k] = v data[row[COMPOUND_NAME]] = parsed_row return data
[docs]def calculate_report_rmse(ref_data: ParsedReport, test_data: ParsedReport) -> Dict[str, float]: """ Calculate the rsme between the parsed reference data and the test data. Missing data is treated as infinite difference Used in STU and SCIVAL tests. :return: Dictionary where the keys are the column names and the values are the rmse for that column. """ from schrodinger.application.scisol.packages.fep import fep_stats result = {} for col in list(ref_data.values())[0].keys(): if col == COMPOUND_NAME: continue ref_array = [] test_array = [] for compound in ref_data.keys(): ref_val = ref_data[compound][col] # Handle soluble compounds if ref_val == SOLUBLE_MARKER or ref_val is None: ref_array.append(0.0) # If both marked soluble, set as 0 difference # otherwise set as inf difference. if test_data[compound][col] == ref_val: test_array.append(0.0) else: test_array.append(float('inf')) continue # Not marked soluble ref_array.append(float(ref_val)) if col in test_data[compound]: val = test_data[compound][col] # Should not be marked soluble, if it is # then give inf difference. if val == SOLUBLE_MARKER or val is None: test_array.append(float('inf')) else: test_array.append(float(val)) else: # Treat missing data as inf difference test_array.append(float('inf')) _, rmse = fep_stats.calc_stats( np.abs(np.array(ref_array) - np.array(test_array))) result[col] = rmse.val return result
[docs]def parse_output_structure_properties(sts: List[Structure]) -> ParsedReport: """ Parse the ct level solubility properties from report structures and return the dictionary of {compound: {property: value}}. Missing values are set to None. Used in STU and SCIVAL tests. """ FEP_SOLUBILITY_ERR = f'{constants.FEP_SOLUBILITY} err' result = defaultdict(dict) for st in sts: dg_dissolution = st.property.get(constants.FEP_SOLUBILITY) if dg_dissolution == SOLUBLE_MARKER: result[st.title][constants.FEP_SOLUBILITY] = SOLUBLE_MARKER result[st.title][FEP_SOLUBILITY_ERR] = None result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = None continue elif dg_dissolution is not None: dg_dissolution = Measurement.from_string(dg_dissolution) result[st.title][constants.FEP_SOLUBILITY] = dg_dissolution.val result[st.title][FEP_SOLUBILITY_ERR] = dg_dissolution.unc else: result[st.title][constants.FEP_SOLUBILITY] = None result[st.title][FEP_SOLUBILITY_ERR] = None result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = st.property.get( constants.FEP_SOLUBILITY_MICROMOLAR) return result
[docs]def bootstrapped_solvation_free_energy( solvation_dgs: List[Measurement]) -> ConfidenceInterval: data = np.array([dg.val for dg in solvation_dgs]) result = bs.bootstrap(data, stat_func=bs_stats.mean, is_pivotal=False) return ConfidenceInterval(result.value, result.lower_bound, result.upper_bound)