Source code for schrodinger.analysis.enrichment.enrichment_input

"""
Input file parser for enrichment module.

For most virtual screen result input formats, titles are used to identify the
ligands. The input is expected to be correctly ordered. If it is not ordered,
please set the optional parameter sort_header in parser functions to the
correct score header/property. If the file contains duplicate titles then only
the first occurrence of a unique title is ranked.

Input file formats::

    <actives_file>
        Text file.
            Raw text, one title per line.
        Structure file.
            A file containing structures with a meaningful title.
        CSV file.
            A comma-separated values file.
        List(str).
            A list of active string titles.
    <results_file>
        Structure file, e.g. 'foo_pv.mae'
            A file containing ordered structures.
        CSV file.
            A comma-separated values file containing ranked titles ordered by
            virtual screen scoring metric.
        List(str) or List(structure).
            A list of ranked titles ordered by virtual screen scoring metric.

API examples::

    # Ex. 1) Calculate BEDROC
    active_titles = extract_active_titles_from_txt(actives_file)
    total_actives, total_ligands, active_ranks, adjusted_active_ranks,
        total_ranked, title_ranks = extract_ranks_from_mae(
            mae_file_name="screen_results.maegz",
            active_titles=active_titles,
            num_decoy=1000)
    bedroc, bedroc_ra = metrics.calcBEDROC(total_actives, total_ligands,
                                           active_ranks, 20.0)

    # Ex. 2) Using the reporter class to calculate the default set of metrics.
             Note that this is not a good practice.
    r = reporter.EnrichmentReporter(
        actives_file="my_actives.txt",
        results_file="screen_results.maegz",
        num_decoy=1000)
    r.report()

Copyright Schrodinger, LLC. All rights reserved.
"""

import csv
import itertools

import schrodinger.application.canvas.fingerprint as canvas_fp
import schrodinger.application.canvas.similarity as canvas_sim
import schrodinger.structure as structure
import schrodinger.utils.log as log
from schrodinger.application.glide.utils import is_valid_pv_file
from schrodinger.utils import csv_unicode

CSV_FILE_EXT = ('.csv', '.CSV')
CSV_TITLE_HEADER = "Title"
ST_TITLE_HEADER = "s_m_title"

logger = log.get_output_logger(__file__)


[docs]class FingerprintComponent:
    """
    Data class that contains critical objects that all fingerprint-related
    metrics functions (calc_DEF, calc_DEFStar and calc_DEFP) need.

    :cvar fp_gen: Object needed to generate fingerprint for each active title.
    :vartype fp_gen: CanvasFingerprintGenerator

    :cvar fp_sim: Object needed to compare fingerprint similarity for each
                  active pair.
    :vartype fp_sim: CanvasFingerprintSimilarity

    :cvar active_fingerprint: Title keys for fingerprint. Not available for
                              screen results that don't include title and
                              structure information.
    :vartype active_fingerprint: dict

    :cvar min_Tc_total_actives: A float representing the lowest Tc, Tanimoto
                                coefficient, of all the active similarity
                                pairs.
    :vartype min_Tc_total_actives: float
    """

[docs]    def __init__(self, fp_gen, fp_sim, active_fingerprint,
                 min_Tc_total_actives):
        self.fp_gen = fp_gen
        self.fp_sim = fp_sim
        self.active_fingerprint = active_fingerprint
        self.min_Tc_total_actives = min_Tc_total_actives


###############################################################################
# Public Functions
###############################################################################


[docs]def extract_active_titles_from_csv(actives_file):
    """
    Parse actives_file as a csv file, return distinct active titles.
    Repeated active titles are ignored.

    :param actives_file: A csv file containing all active titles.
    :type actives_file: str

    :return: Distinct active titles from the actives file.
    :rtype: set(str)
    """
    with csv_unicode.reader_open(actives_file) as fh:
        active_titles = set(row[CSV_TITLE_HEADER] for row in csv.DictReader(fh))
    return active_titles


[docs]def extract_active_titles_from_mae(actives_file):
    """
    Parse actives_file as a maestro file, return distinct active titles.
    Repeated active titles are ignored.

    :param actives_file: A maestro file containing all active titles.
    :type actives_file: str

    :return: Distinct active titles from the actives file.
    :rtype: set(str)
    """
    active_titles = set(
        st.title for st in structure.StructureReader(actives_file))
    return active_titles


[docs]def extract_active_titles_from_txt(actives_file):
    """
    Parse actives_file as a raw text file with one title per line,
    return distinct active titles from the actives file.
    Repeated active titles are ignored.

    :param actives_file: Raw text file containing one title per line.
    :type actives_file: str

    :return: Distinct active titles from the actives file.
    :rtype: set(str)
    """
    with open(actives_file, 'r') as fh:
        active_titles = set(line.strip('\n') for line in fh)
    return active_titles


[docs]def extract_active_titles_from_list(actives):
    """
    Parse actives from list of string, return distinct active titles from the
    list. Repeated active titles are ignored.

    :param actives: A list of strings containing all active titles.
    :type actives: list(str)

    :return: Distinct active titles from the actives file.
    :rtype: set(str)
    """
    return set(title for title in actives)


[docs]def extract_ranks_from_list(titles_iter, active_titles, num_decoy=0):
    """
    Compute and return rank and count related terms from a list of ligand titles
    pre-sorted by virtual screen scoring metric.

    :param titles_iter: A list of title strings, pre-sorted by virtual screen
                        scoring metric.
    :type titles_iter: list(str)

    :param active_titles: Distinct active titles from the actives file
    :type active_titles: set(str)

    :param num_decoy: The total number of decoys. If specified, the total
                      number of ligands will be distinct active titles from
                      actives file + num_decoy. This will enable the calculation
                      of the correction term in calc_AUAC, should the total
                      number of ligands not equal to the total number of ranked
                      titles in results_file.
    :type num_decoy: int

    :return: A tuple containing total number of active titles, total number
             of ligand titles, active ranks, adjusted active ranks, total
             number of ranked titles, and a dictionary storing active titles
             as keys and their ranks as value.
    :rtype: int, int, list(int), list(int), int, dict(str, int)

    """
    # Bind initial values
    title_ranks = {}
    hits_titles = set()

    # Parse the file
    for i, title in enumerate(titles_iter):
        _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles,
                                              title_ranks)
    total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \
        _calc_rank_terms(active_titles, hits_titles, title_ranks)
    total_actives, total_ligands = \
        _calc_counts(active_titles, active_ranks, num_decoy, total_ranked)

    return (total_actives, total_ligands, active_ranks, adjusted_active_ranks,
            total_ranked, title_ranks)


[docs]def extract_ranks_from_csv(csv_file_name,
                           active_titles,
                           num_decoy=0,
                           id_header=CSV_TITLE_HEADER):
    """
    Compute and return rank and count related terms from a csv file.

    :param csv_file_name: File name of the csv file that contains the virtual
                          screening result.
    :type csv_file_name: str

    :param active_titles: Distinct active titles from the actives file
    :type active_titles: set(str)

    :param num_decoy: The total number of decoys. If specified, the total
                      number of ligands will be distinct active titles from
                      actives file + num_decoy. This will enable the calculation
                      of the correction term in calc_AUAC, should the total
                      number of ligands not equal to the total number of ranked
                      titles in results_file.
    :type num_decoy: int

    :param id_header: Name of compound-identifying header.
    :type id_header: str

    :return: A tuple containing total number of active titles, total number
             of ligand titles, active ranks, adjusted active ranks, total
             number of ranked titles, and a dictionary storing active titles
             as keys and their ranks as value.
    :rtype: int, int, list(int), list(int), int, dict(str, int)

    """
    # Bind initial values
    title_ranks = {}
    hits_titles = set()

    with csv_unicode.reader_open(csv_file_name) as fh:
        for row in csv.DictReader(fh):
            title = row[id_header]
            _set_title_to_hits_titles_title_ranks(title, active_titles,
                                                  hits_titles, title_ranks)
    total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \
        _calc_rank_terms(active_titles, hits_titles, title_ranks)
    total_actives, total_ligands = \
        _calc_counts(active_titles, active_ranks, num_decoy, total_ranked)

    return (total_actives, total_ligands, active_ranks, adjusted_active_ranks,
            total_ranked, title_ranks)


[docs]def extract_ranks_from_structures(structure_iter,
                                  active_titles,
                                  num_decoy=0,
                                  id_property=ST_TITLE_HEADER):
    """
    Compute and return rank and count related terms from a list of structures.

    :param structure_iter: A list of structure.Structure.
    :type structure_iter: list(structure.Structure)

    :param active_titles: Distinct active titles from the actives file
    :type active_titles: set(str)

    :param num_decoy: The total number of decoys. If specified, the total
                      number of ligands will be distinct active titles from
                      actives file + num_decoy. This will enable the calculation
                      of the correction term in calc_AUAC, should the total
                      number of ligands not equal to the total number of ranked
                      titles in results_file.
    :type num_decoy: int

    :param id_property: Name of compound-identifying property.
    :type id_property: str

    :return: A tuple containing total number of active titles, total number
             of ligand titles, active ranks, adjusted active ranks, total
             number of ranked titles, and a dictionary storing active titles
             as keys and their ranks as value.
    :rtype: int, int, list(int), list(int), int, dict(str, int)

    """
    # Bind initial values
    title_ranks = {}
    hits_titles = set()

    for st in structure_iter:
        title = st.property[id_property]
        _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles,
                                              title_ranks)
    total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \
        _calc_rank_terms(active_titles, hits_titles, title_ranks)
    total_actives, total_ligands = \
        _calc_counts(active_titles, active_ranks, num_decoy, total_ranked)

    return (total_actives, total_ligands, active_ranks, adjusted_active_ranks,
            total_ranked, title_ranks)


[docs]def extract_ranks_from_mae(mae_file_name,
                           active_titles,
                           num_decoy=0,
                           id_property=ST_TITLE_HEADER):
    """
    Compute and return rank and count related terms from a structure file.

    :param mae_file_name: A structure file that contains the virtual
                          screening result.
    :type mae_file_name: str

    :param active_titles: Distinct active titles from the actives file
    :type active_titles: set(str)

    :param num_decoy: The total number of decoys. If specified, the total
                      number of ligands will be distinct active titles from
                      actives file + num_decoy. This will enable the calculation
                      of the correction term in calc_AUAC, should the total
                      number of ligands not equal to the total number of ranked
                      titles in results_file.
    :type num_decoy: int

    :param id_property: Name of compound-identifying property.
    :type id_property: str

    :return: A tuple containing total number of active titles, total number
             of ligand titles, active ranks, adjusted active ranks, total
             number of ranked titles, and a dictionary storing active titles
             as keys and their ranks as value.
    :rtype: int, int, list(int), list(int), int, dict(str, int)

    """
    # Bind initial values
    title_ranks = {}
    hits_titles = set()

    # Assume Glide results are typical, skip the receptor in pv files
    file_index = 2 if is_valid_pv_file(mae_file_name) else 1

    for st in structure.StructureReader(mae_file_name, file_index):
        title = st.property[id_property]
        _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles,
                                              title_ranks)
    total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \
        _calc_rank_terms(active_titles, hits_titles, title_ranks)
    total_actives, total_ligands = \
        _calc_counts(active_titles, active_ranks, num_decoy, total_ranked)

    return (total_actives, total_ligands, active_ranks, adjusted_active_ranks,
            total_ranked, title_ranks)


[docs]def get_fingerprint_components(structure_file,
                               active_titles,
                               id_property=ST_TITLE_HEADER):
    """
    Initialize and return a data class object needed for fingerprint-related
    calculations.

    :param structure_file: Structure file or a list of structures.
    :type structure_file: str or list(str)

    :param active_titles: Distinct active titles from the actives file
    :type active_titles: set(str)

    :param id_property: Name of compound-identifying property.
    :type id_property: str

    :return: The initialized enrichment_input.FingerprintComponent object.
    :rtype: enrichment_input.FingerprintComponent

    """

    fp_gen = canvas_fp.CanvasFingerprintGenerator(logger=logger)
    fp_gen.setType('Linear')
    fp_gen.setAtomBondTyping(fp_gen.getDefaultAtomTypingScheme())
    fp_sim = canvas_sim.CanvasFingerprintSimilarity(logger=logger)
    fp_sim.setMetric('Tanimoto')

    # Bind initial values
    title_ranks = {}
    hits_titles = set()
    active_fingerprint = {}

    if isinstance(structure_file, str):
        # Assume Glide results are typical, skip the receptor in pv files
        file_index = 2 if is_valid_pv_file(structure_file) else 1
        for st in structure.StructureReader(structure_file, file_index):
            title = st.property[id_property]
            _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles,
                             title_ranks, active_fingerprint)
    elif hasattr(structure_file, '__iter__'):
        for st_index, st in enumerate(structure_file):
            title = st.property[id_property]
            _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles,
                             title_ranks, active_fingerprint)
    else:
        msg = "Input file should be a structure file or a list of structures."
        raise TypeError(msg)

    min_tc = _calc_min_tc_total_actives(fp_sim, active_fingerprint)

    return FingerprintComponent(fp_gen, fp_sim, active_fingerprint, min_tc)


##############################################################################
# Protected Functions
###############################################################################


def _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles,
                                          title_ranks):
    """
    Helper function for parsers to set the correct rank and title to
    hits_titles and title_ranks.

    :param title: Ligand name.
    :type title: str

    :param active_titles: Distinct active titles parsed from the actives file.
    :type active_titles: set(str)

    :param hits_titles: Distinct titles parsed from the results file.
    :type hits_titles: set(str)

    :param title_ranks: A dictionary storing active titles as keys and their
                        ranks as value.
    :type title_ranks: dict(str, int)

    """

    if title not in hits_titles:
        hits_titles.add(title)
        if title in active_titles:
            rank = len(hits_titles)
            title_ranks[rank] = title


def _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles, title_ranks,
                     active_fingerprint):
    """
    Helper function for parsers to set the correct rank and title to
    hits_titles and title_ranks. Also generate structure fingerprint while
    processing each title.

    :param st: Ligand structure.
    :type st: structure.Structure

    :param title: Ligand name.
    :type title: str

    :param active_titles: Distinct active titles parsed from the actives file.
    :type active_titles: set(str)

    :param fp_gen: canvas_fp.CanvasFingerprintGenerator object.
    :type fp_gen: canvas_fp.CanvasFingerprintGenerator

    :param hits_titles: Distinct titles parsed from the results file.
    :type hits_titles: set(str)

    :param title_ranks: A dictionary storing active titles as keys and their
                        ranks as value.
    :type title_ranks: dict(str, int)

    :param active_fingerprint: Aictionary with active title as key, the
                               corresponding structure's fingerprint as value.
    :type active_fingerprint: dict(str, str)

    """

    if title not in hits_titles:
        hits_titles.add(title)
        if title in active_titles:
            rank = len(hits_titles)
            title_ranks[rank] = title
            if title in active_titles:
                active_fingerprint[title] = fp_gen.generate(st)


def _calc_rank_terms(active_titles, hits_titles, title_ranks):
    """
    Helper function for parsers to calculate and return rank related terms.

    :param active_titles: Distinct active titles parsed from the actives file.
    :type active_titles: set(str)

    :param hits_titles: Distinct titles parsed from the results file.
    :type hits_titles: set(str)

    :param title_ranks: A dictionary storing active titles as keys and their
                        ranks as value.
    :type title_ranks: dict(str, int)

    :return: A tuple of total number of ranked titles, active ranks, adjusted
             active ranks and the missing active titles.
    :rtype: (int, list(int), list(int), list(str))
    """

    total_ranked = len(hits_titles)
    active_ranks = sorted(list(title_ranks))
    adjusted_active_ranks = [
        (rank - index) for index, rank in enumerate(active_ranks)
    ]
    missing_active_titles = [x for x in active_titles if x not in hits_titles]
    if missing_active_titles:
        logger.warning(f"Missing active titles found {missing_active_titles}")

    return (total_ranked, active_ranks, adjusted_active_ranks,
            missing_active_titles)


def _calc_counts(active_titles, active_ranks, num_decoy, total_ranked):
    """
    Helper function for parsers to calculate and return total_actives and
    total_ligands.

    NOTE: this function is unnecessary if unranked actives and unranked decoys
    are not something we should consider.

    :param active_titles: Distinct active titles parsed from the actives file.
    :type active_titles: set(str)

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param num_decoy: The total number of decoys. If specified, the total
                      number of ligands will be distinct active titles from
                      actives file + num_decoy. This will enable the calculation
                      of the correction term in calc_AUAC, should the total
                      number of ligands not equal to the total number of ranked
                      titles in results_file.
    :type num_decoy: int

    :param total_ranked: The total number of ligands ranked by the virtual
                         screen scoring metric.
    :type total_ranked: int

    :return: A tuple of total number of ranked actives and total number of
             ligands (ranked/unranked)
    :rtype: (int, int)

    """

    total_actives_from_actives = len(active_titles)
    total_actives_from_results = len(active_ranks)

    if not num_decoy:
        total_actives = total_actives_from_results
        total_ligands = total_ranked
    else:
        total_actives = total_actives_from_actives
        total_ligands = total_actives + num_decoy

    return (total_actives, total_ligands)


def _calc_min_tc_total_actives(fp_sim, active_fingerprint):
    """
    Calculate min_Tc_total_actives, a float representing the lowest
    Tc, Tanimoto coefficient, of all the active similarity pairs.
    Tc is scaled from 0.0-1.0, where a 1.0 indicates a high degree
    of similarity. O(N^2) complexity, N = total number of actives.

    :param fp_sim: canvas_sim.CanvasFingerprintSimilarity instance
    :type fp_sim: canvas_sim.CanvasFingerprintSimilarity

    :param active_fingerprint: Dictionary with active title as key, the
                               corresponding structure's fingerprint as value
    :type active_fingerprint: dict(str, str)

    :return: Lowest Tanimoto coefficient of all the active structure pairs
    :rtype: float

    """

    # The highest possible value for minTc is 1
    min_Tc_total_actives = 1.0

    if not active_fingerprint:
        msg = "_calc_min_tc_total_actives: missing fingerprints. minTc set to 1"
        logger.debug(msg)
        return min_Tc_total_actives

    for i_title, j_title in itertools.combinations(active_fingerprint.keys(),
                                                   2):
        sim = fp_sim.calculateSimilarity(active_fingerprint[i_title],
                                         active_fingerprint[j_title])
        if sim < min_Tc_total_actives:
            min_Tc_total_actives = sim

    return min_Tc_total_actives