Source code for schrodinger.application.desmond.solubility_utils

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import copy
import csv
from collections import defaultdict
from io import StringIO
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from schrodinger.application.desmond import cms
from schrodinger.application.desmond import constants
from schrodinger.application.desmond.measurement import Measurement
from schrodinger.application.desmond.correlation_tau import ConfidenceInterval
from schrodinger.structure import Structure
from schrodinger.structutils import analyze

# Default number of sublimation jobs
NUM_SUBLIMATION_JOBS = 5
NUM_POLYMER_SOLVATION_JOBS = 20

OBVIOUS_SASA_CUTOFF = 3.0

# Marker that indicates structure is soluble.
SOLUBLE_MARKER = "< 5kcal/mol"

ParsedReport = Dict[str, Dict]
COMPOUND_NAME = 'Compound Name'


[docs]def extract_solute_ct(cms_model: cms.Cms) -> Optional[Structure]:
    """
    Returns a structure representing the solute component of
    the input cms file. The solute should come before the solvent
    if there are custom charges present.

    :param cms_model: Input cms object to extract the solute from.

    :return: Solute structure object or None if no solute is found

    """
    solute_atom_list = cms_model.select_atom("solute")
    if solute_atom_list:
        # This preserves any custom charges if present
        ct = cms_model.fsys_ct.copy()
        del_atoms_idxs = [
            i for i in range(1, ct.atom_total + 1) if i not in solute_atom_list
        ]
        ct.deleteAtoms(del_atoms_idxs)
        return ct


[docs]def get_transfer_free_energy(cms_model: cms.Cms) -> Optional[Measurement]:
    """
    Returns the transfer free energy as a Measurement or None if the property
    could not be found. Depending on the leg that the cmd corresponds to,
    this could be sublimation, solvation, or hydration dGs.

    :param cms_model: Input cms object to extract the transfer free energy from.

    """
    for ct in cms_model.comp_ct:
        transfer_free_ene = ct.property.get(constants.FEP_TRANSFER_FREE_ENERGY)
        if transfer_free_ene:
            return Measurement.from_string(transfer_free_ene)


[docs]def median_sublimation_free_energy(
        sublimation_dgs: List[Measurement]) -> Measurement:
    """
    Returns the median sublimation free energy. For an odd
    number this is the median, for an even number this is the
    one closest to the average.

    :param sublimation_dgs: List of sublimation transfer free energy
                            measurements.

    """
    sublimation_vals = np.array([fene.val for fene in sublimation_dgs])
    # Odd number of sublimation jobs, use the median
    if len(sublimation_dgs) % 2 == 1:
        median_idx = np.argmin(
            np.abs(sublimation_vals - np.median(sublimation_vals)))
    # Even number of sublimation jobs, use the one closer to the average
    else:
        median_idx = np.argmin(
            np.abs(sublimation_vals - np.average(sublimation_vals)))
    return sublimation_dgs[median_idx]


[docs]def calculate_solubility_free_energy(
        hydration_dg: Measurement, sublimation_dg: Measurement) -> Measurement:
    """
    Returns the dissolution free energy.

    NOTE:
        - "Sublimation" leg actually simulates deposition (gas -> solid)
        - Solvation/hydration legs simulate solvation (solid -> solvated)
        - Dissolution ("solubility") free energy = Hydration - "Sublimation"

    :param hydration_dg: Hydration transfer free energy measurement.

    :param sublimation_dg: Sublimation transfer free energy measurement.

    """
    return hydration_dg - sublimation_dg


[docs]def set_structure_soluble(ct: Structure) -> None:
    """
    Update the fep solubility property to mark the structure as being soluble.
    The input ct is modified in place.

    :param ct: Structure to mark as soluble.

    """
    ct.property[constants.FEP_SOLUBILITY] = SOLUBLE_MARKER


[docs]def is_structure_soluble(ct: Structure) -> bool:
    """
    Return True if the structure has been marked as soluble, False otherwise.

    :param ct: Structure to check if marked as soluble.

    """
    return ct.property.get(constants.FEP_SOLUBILITY) == SOLUBLE_MARKER


[docs]def is_soluble(exposures: List[float]) -> bool:
    """
    If more than 10% of the solute molecules have a exposure greater than 90.0,
    flag the molecule as soluble (< 5 kcal/mol transfer energy).

    :param exposures: List of exposure for each molecule in the structure.

    :return: True if this molecule is considered soluble, False otherwise.

    """
    exposed_mols = len([1 for score in exposures if score > 90.0])
    percent_exposed = float(exposed_mols) / float(len(exposures)) * 100.0
    return percent_exposed > 10.0


[docs]def get_molecule_exposures(baseline_ct: Structure,
                           exposure_ct: Structure,
                           delete_h_atoms=True) -> List[float]:
    """
    Calculates the exposure of each molecule in a structure
    based on SASA and returns a list of exposure of each molecule
    by index of each molecule in the structure - 1 (since structure.molecule is a
    1-indexed list). By default this functiion deletes all H atoms from the
    specified structure. To retain H atoms set delete_h_atoms to False.

    :param baseline_ct: Structure of an individual molecule used as the reference.

    :param exposure_ct: Structure used for the list of molecule exposures.

    :param delete_h_atoms: Set to True to delete all H atoms from
                           the structure when calculating SASA.

    :return: List of exposure for each molecule in the structure.

    """
    exposures = []
    exposure_ct = exposure_ct.copy()
    baseline_ct = baseline_ct.copy()
    if delete_h_atoms:
        for ct in [exposure_ct, baseline_ct]:
            ct.deleteAtoms(analyze.evaluate_asl(ct, "atom.ele H"))
    for mol in exposure_ct.molecule:
        score = 0.0
        mol_sasa = analyze.calculate_sasa(exposure_ct, mol.atom)
        rel_mol_sasa = mol_sasa / len(mol.atom)
        for atom in mol.atom:
            # Get SASA of original exposure_ct atom for comparison
            baseline_atom_idx = atom.index % baseline_ct.atom_total or baseline_ct.atom_total
            baseline_atom = baseline_ct.atom[baseline_atom_idx]
            baseline_sasa = analyze.calculate_sasa(baseline_ct, [baseline_atom])
            if rel_mol_sasa < OBVIOUS_SASA_CUTOFF:
                atom_sasa = 0.0
            else:
                atom_sasa = analyze.calculate_sasa(exposure_ct, [atom])
            if atom_sasa > baseline_sasa:
                score += 1.0
        score = 100.0 * (score / float(len(mol.atom)))
        exposures.append(score)
    return exposures


[docs]def extract_most_exposed_solute_molecules(solute_ct: Structure,
                                          exposures: List[float],
                                          extract_count=5) -> List[Structure]:
    """
    Takes the solute component of an input system and
    returns the `extract_count` most exposed molecules
    ordered by exposure.

    If less than `extract_count` solute molecules are present,
    the returned list contains all solute molecules sorted by exposure.

    :param solute_ct: Input ct file containing the system to process.

    :param exposures: Input ct file containing the system to process.

    :param extract_count: The number of molecules to extract.

    :return: List of `extract_count` most exposed solute mols.

    """
    ranked = sorted(exposures, reverse=True)[:extract_count]
    first_st = True
    solute_cts = []
    for score in ranked:
        score_idx = exposures.index(score)
        # remove score from list in case there are duplicates
        exposures[score_idx] = None
        mol_idx = score_idx + 1
        mol_st = copy.copy(solute_ct)
        for imol, mol in enumerate(mol_st.molecule, start=1):
            if imol == mol_idx:
                ffio_val = 1
            else:
                ffio_val = 0
            for atom in mol.atom:
                atom.property[constants.FEP_ABSOLUTE_ENERGY] = ffio_val
                atom.property[constants.FEP_ABSOLUTE_LIGAND] = ffio_val

        solute_cts.append(mol_st)
    return solute_cts


[docs]def get_header(max_num_sublimation: int):
    line = (
        "Compound Name,Dissolution free energy,Dissolution free energy err,"
        "Hydration free energy," +
        "Hydration free energy err,Median Sublimation free energy,Median Sublimation free energy err"
    )
    if max_num_sublimation:
        sublime_line = ',' + ",".join(
            f"Sublimation free energy {i+1},Sublimation free energy {i+1} err"
            for i in range(max_num_sublimation))
        line += sublime_line
    line += '\n'
    return line


[docs]def get_header_solvation(max_num_solvation: int):
    line = (
        "Compound Name,Mean Solvation free energy,Mean Solvation free energy "
        "lower bound,Mean Solvation free energy upper bound")
    if max_num_solvation:
        line += ',' + ','.join(
            f"Solvation free energy {i+1}, Solvation free energy {i+1} err"
            for i in range(max_num_solvation))
    line += '\n'
    return line


[docs]def get_report(title: str,
               hydration_dg: Optional[Measurement] = None,
               sublimation_dgs: Optional[List[Measurement]] = None,
               solvation_dgs: Optional[List[Measurement]] = None,
               is_soluble: bool = False) -> str:
    """
    Returns the formatted results as csv file contents.

    :param title: Title of the ligand.

    :param hydration_dg: Hydration transfer free energy measurement.
                         Must be specified if is_soluble is False.

    :param sublimation_dgs: List of sublimation transfer free energy
                            measurements.
                            Must be specified if is_soluble is False.

    :param solvation_dgs: List of solvation transfer free energy
                            measurements.

    :param is_soluble: Set to True if this ligand
                       was determined to be soluble in the MD stage.

    """
    from itertools import chain

    if is_soluble:
        line = f"{title},{SOLUBLE_MARKER}\n"
    elif solvation_dgs:
        bs_solvation_dg = bootstrapped_solvation_free_energy(solvation_dgs)
        bs_str = f'{bs_solvation_dg.val},{bs_solvation_dg.lower_bound},{bs_solvation_dg.upper_bound},'
        results = [f'{dg.val},{dg.unc}' for dg in solvation_dgs]
        line = (f"{title}," + bs_str + ",".join(results) + "\n")
    else:
        if sublimation_dgs:
            median_sublimation_dg = median_sublimation_free_energy(
                sublimation_dgs)
            dissolution_dg = calculate_solubility_free_energy(
                hydration_dg, median_sublimation_dg)
        else:
            median_sublimation_dg = 'N/A+-N/A'
            dissolution_dg = 'N/A+-N/A'
        line = (f"{title}," + ",".join(
            f"{str(d).replace('+-', ',')}"
            for d in chain((dissolution_dg, hydration_dg,
                            median_sublimation_dg), sublimation_dgs)) + "\n")
    return line


[docs]def parse_report(csv_contents: str) -> ParsedReport:
    """
    Parse a report csv and return a `ParsedReport` mapping the ligand to the result.
    Each result is a dictionary where the keys are the columns from the csv.
    """
    data = {}
    f = StringIO(csv_contents)
    for row in csv.DictReader(f):
        parsed_row = {}
        for k, v in row.items():
            if k is not None:
                k = k.strip()
            if v is not None:
                v = v.strip()
            if k == COMPOUND_NAME:
                continue
            if v == 'N/A':
                v = None
            if v is not None and v != SOLUBLE_MARKER:
                v = float(v)
            parsed_row[k] = v
        data[row[COMPOUND_NAME]] = parsed_row
    return data


[docs]def calculate_report_rmse(ref_data: ParsedReport,
                          test_data: ParsedReport) -> Dict[str, float]:
    """
    Calculate the rsme between the parsed reference data and the test data.
    Missing data is treated as infinite difference

    Used in STU and SCIVAL tests.

    :return: Dictionary where the keys are the column names and the values
        are the rmse for that column.
    """
    from schrodinger.application.scisol.packages.fep import fep_stats
    result = {}
    for col in list(ref_data.values())[0].keys():
        if col == COMPOUND_NAME:
            continue
        ref_array = []
        test_array = []
        for compound in ref_data.keys():
            ref_val = ref_data[compound][col]

            # Handle soluble compounds
            if ref_val == SOLUBLE_MARKER or ref_val is None:
                ref_array.append(0.0)
                # If both marked soluble, set as 0 difference
                # otherwise set as inf difference.
                if test_data[compound][col] == ref_val:
                    test_array.append(0.0)
                else:
                    test_array.append(float('inf'))
                continue

            # Not marked soluble
            ref_array.append(float(ref_val))
            if col in test_data[compound]:
                val = test_data[compound][col]
                # Should not be marked soluble, if it is
                # then give inf difference.
                if val == SOLUBLE_MARKER or val is None:
                    test_array.append(float('inf'))
                else:
                    test_array.append(float(val))
            else:
                # Treat missing data as inf difference
                test_array.append(float('inf'))

        _, rmse = fep_stats.calc_stats(
            np.abs(np.array(ref_array) - np.array(test_array)))
        result[col] = rmse.val
    return result


[docs]def parse_output_structure_properties(sts: List[Structure]) -> ParsedReport:
    """
    Parse the ct level solubility properties from report structures and return the
    dictionary of {compound: {property: value}}. Missing values are set to None.

    Used in STU and SCIVAL tests.
    """
    FEP_SOLUBILITY_ERR = f'{constants.FEP_SOLUBILITY} err'
    result = defaultdict(dict)
    for st in sts:
        dg_dissolution = st.property.get(constants.FEP_SOLUBILITY)
        if dg_dissolution == SOLUBLE_MARKER:
            result[st.title][constants.FEP_SOLUBILITY] = SOLUBLE_MARKER
            result[st.title][FEP_SOLUBILITY_ERR] = None
            result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = None
            continue
        elif dg_dissolution is not None:
            dg_dissolution = Measurement.from_string(dg_dissolution)
            result[st.title][constants.FEP_SOLUBILITY] = dg_dissolution.val
            result[st.title][FEP_SOLUBILITY_ERR] = dg_dissolution.unc
        else:
            result[st.title][constants.FEP_SOLUBILITY] = None
            result[st.title][FEP_SOLUBILITY_ERR] = None

        result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = st.property.get(
            constants.FEP_SOLUBILITY_MICROMOLAR)

    return result


[docs]def bootstrapped_solvation_free_energy(
        solvation_dgs: List[Measurement]) -> ConfidenceInterval:
    data = np.array([dg.val for dg in solvation_dgs])
    result = bs.bootstrap(data, stat_func=bs_stats.mean, is_pivotal=False)
    return ConfidenceInterval(result.value, result.lower_bound,
                              result.upper_bound)