Source code for schrodinger.analysis.enrichment.enrichment_input

"""
Input file parser for enrichment module.

For most virtual screen result input formats, titles are used to identify the
ligands. The input is expected to be correctly ordered. If it is not ordered,
please set the optional parameter sort_header in parser functions to the
correct score header/property. If the file contains duplicate titles then only
the first occurrence of a unique title is ranked.

Input file formats::

    <actives_file>
        Text file.
            Raw text, one title per line.
        Structure file.
            A file containing structures with a meaningful title.
        CSV file.
            A comma-separated values file.
        List(str).
            A list of active string titles.
    <results_file>
        Structure file, e.g. 'foo_pv.mae'
            A file containing ordered structures.
        CSV file.
            A comma-separated values file containing ranked titles ordered by
            virtual screen scoring metric.
        List(str) or List(structure).
            A list of ranked titles ordered by virtual screen scoring metric.

API examples::

    # Ex. 1) Calculate BEDROC
    active_titles = extract_active_titles_from_txt(actives_file)
    total_actives, total_ligands, active_ranks, adjusted_active_ranks,
        total_ranked, title_ranks = extract_ranks_from_mae(
            mae_file_name="screen_results.maegz",
            active_titles=active_titles,
            num_decoy=1000)
    bedroc, bedroc_ra = metrics.calcBEDROC(total_actives, total_ligands,
                                           active_ranks, 20.0)

    # Ex. 2) Using the reporter class to calculate the default set of metrics.
             Note that this is not a good practice.
    r = reporter.EnrichmentReporter(
        actives_file="my_actives.txt",
        results_file="screen_results.maegz",
        num_decoy=1000)
    r.report()

Copyright Schrodinger, LLC. All rights reserved.
"""

import csv
import itertools

import schrodinger.application.canvas.fingerprint as canvas_fp
import schrodinger.application.canvas.similarity as canvas_sim
import schrodinger.structure as structure
import schrodinger.utils.log as log
from schrodinger.application.glide.utils import is_valid_pv_file
from schrodinger.utils import csv_unicode

CSV_FILE_EXT = ('.csv', '.CSV')
CSV_TITLE_HEADER = "Title"
ST_TITLE_HEADER = "s_m_title"

logger = log.get_output_logger(__file__)


[docs]class FingerprintComponent: """ Data class that contains critical objects that all fingerprint-related metrics functions (calc_DEF, calc_DEFStar and calc_DEFP) need. :cvar fp_gen: Object needed to generate fingerprint for each active title. :vartype fp_gen: CanvasFingerprintGenerator :cvar fp_sim: Object needed to compare fingerprint similarity for each active pair. :vartype fp_sim: CanvasFingerprintSimilarity :cvar active_fingerprint: Title keys for fingerprint. Not available for screen results that don't include title and structure information. :vartype active_fingerprint: dict :cvar min_Tc_total_actives: A float representing the lowest Tc, Tanimoto coefficient, of all the active similarity pairs. :vartype min_Tc_total_actives: float """
[docs] def __init__(self, fp_gen, fp_sim, active_fingerprint, min_Tc_total_actives): self.fp_gen = fp_gen self.fp_sim = fp_sim self.active_fingerprint = active_fingerprint self.min_Tc_total_actives = min_Tc_total_actives
############################################################################### # Public Functions ###############################################################################
[docs]def extract_active_titles_from_csv(actives_file): """ Parse actives_file as a csv file, return distinct active titles. Repeated active titles are ignored. :param actives_file: A csv file containing all active titles. :type actives_file: str :return: Distinct active titles from the actives file. :rtype: set(str) """ with csv_unicode.reader_open(actives_file) as fh: active_titles = set(row[CSV_TITLE_HEADER] for row in csv.DictReader(fh)) return active_titles
[docs]def extract_active_titles_from_mae(actives_file): """ Parse actives_file as a maestro file, return distinct active titles. Repeated active titles are ignored. :param actives_file: A maestro file containing all active titles. :type actives_file: str :return: Distinct active titles from the actives file. :rtype: set(str) """ active_titles = set( st.title for st in structure.StructureReader(actives_file)) return active_titles
[docs]def extract_active_titles_from_txt(actives_file): """ Parse actives_file as a raw text file with one title per line, return distinct active titles from the actives file. Repeated active titles are ignored. :param actives_file: Raw text file containing one title per line. :type actives_file: str :return: Distinct active titles from the actives file. :rtype: set(str) """ with open(actives_file, 'r') as fh: active_titles = set(line.strip('\n') for line in fh) return active_titles
[docs]def extract_active_titles_from_list(actives): """ Parse actives from list of string, return distinct active titles from the list. Repeated active titles are ignored. :param actives: A list of strings containing all active titles. :type actives: list(str) :return: Distinct active titles from the actives file. :rtype: set(str) """ return set(title for title in actives)
[docs]def extract_ranks_from_list(titles_iter, active_titles, num_decoy=0): """ Compute and return rank and count related terms from a list of ligand titles pre-sorted by virtual screen scoring metric. :param titles_iter: A list of title strings, pre-sorted by virtual screen scoring metric. :type titles_iter: list(str) :param active_titles: Distinct active titles from the actives file :type active_titles: set(str) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int :return: A tuple containing total number of active titles, total number of ligand titles, active ranks, adjusted active ranks, total number of ranked titles, and a dictionary storing active titles as keys and their ranks as value. :rtype: int, int, list(int), list(int), int, dict(str, int) """ # Bind initial values title_ranks = {} hits_titles = set() # Parse the file for i, title in enumerate(titles_iter): _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles, title_ranks) total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \ _calc_rank_terms(active_titles, hits_titles, title_ranks) total_actives, total_ligands = \ _calc_counts(active_titles, active_ranks, num_decoy, total_ranked) return (total_actives, total_ligands, active_ranks, adjusted_active_ranks, total_ranked, title_ranks)
[docs]def extract_ranks_from_csv(csv_file_name, active_titles, num_decoy=0, id_header=CSV_TITLE_HEADER): """ Compute and return rank and count related terms from a csv file. :param csv_file_name: File name of the csv file that contains the virtual screening result. :type csv_file_name: str :param active_titles: Distinct active titles from the actives file :type active_titles: set(str) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int :param id_header: Name of compound-identifying header. :type id_header: str :return: A tuple containing total number of active titles, total number of ligand titles, active ranks, adjusted active ranks, total number of ranked titles, and a dictionary storing active titles as keys and their ranks as value. :rtype: int, int, list(int), list(int), int, dict(str, int) """ # Bind initial values title_ranks = {} hits_titles = set() with csv_unicode.reader_open(csv_file_name) as fh: for row in csv.DictReader(fh): title = row[id_header] _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles, title_ranks) total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \ _calc_rank_terms(active_titles, hits_titles, title_ranks) total_actives, total_ligands = \ _calc_counts(active_titles, active_ranks, num_decoy, total_ranked) return (total_actives, total_ligands, active_ranks, adjusted_active_ranks, total_ranked, title_ranks)
[docs]def extract_ranks_from_structures(structure_iter, active_titles, num_decoy=0, id_property=ST_TITLE_HEADER): """ Compute and return rank and count related terms from a list of structures. :param structure_iter: A list of structure.Structure. :type structure_iter: list(structure.Structure) :param active_titles: Distinct active titles from the actives file :type active_titles: set(str) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int :param id_property: Name of compound-identifying property. :type id_property: str :return: A tuple containing total number of active titles, total number of ligand titles, active ranks, adjusted active ranks, total number of ranked titles, and a dictionary storing active titles as keys and their ranks as value. :rtype: int, int, list(int), list(int), int, dict(str, int) """ # Bind initial values title_ranks = {} hits_titles = set() for st in structure_iter: title = st.property[id_property] _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles, title_ranks) total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \ _calc_rank_terms(active_titles, hits_titles, title_ranks) total_actives, total_ligands = \ _calc_counts(active_titles, active_ranks, num_decoy, total_ranked) return (total_actives, total_ligands, active_ranks, adjusted_active_ranks, total_ranked, title_ranks)
[docs]def extract_ranks_from_mae(mae_file_name, active_titles, num_decoy=0, id_property=ST_TITLE_HEADER): """ Compute and return rank and count related terms from a structure file. :param mae_file_name: A structure file that contains the virtual screening result. :type mae_file_name: str :param active_titles: Distinct active titles from the actives file :type active_titles: set(str) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int :param id_property: Name of compound-identifying property. :type id_property: str :return: A tuple containing total number of active titles, total number of ligand titles, active ranks, adjusted active ranks, total number of ranked titles, and a dictionary storing active titles as keys and their ranks as value. :rtype: int, int, list(int), list(int), int, dict(str, int) """ # Bind initial values title_ranks = {} hits_titles = set() # Assume Glide results are typical, skip the receptor in pv files file_index = 2 if is_valid_pv_file(mae_file_name) else 1 for st in structure.StructureReader(mae_file_name, file_index): title = st.property[id_property] _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles, title_ranks) total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles = \ _calc_rank_terms(active_titles, hits_titles, title_ranks) total_actives, total_ligands = \ _calc_counts(active_titles, active_ranks, num_decoy, total_ranked) return (total_actives, total_ligands, active_ranks, adjusted_active_ranks, total_ranked, title_ranks)
[docs]def get_fingerprint_components(structure_file, active_titles, id_property=ST_TITLE_HEADER): """ Initialize and return a data class object needed for fingerprint-related calculations. :param structure_file: Structure file or a list of structures. :type structure_file: str or list(str) :param active_titles: Distinct active titles from the actives file :type active_titles: set(str) :param id_property: Name of compound-identifying property. :type id_property: str :return: The initialized enrichment_input.FingerprintComponent object. :rtype: enrichment_input.FingerprintComponent """ fp_gen = canvas_fp.CanvasFingerprintGenerator(logger=logger) fp_gen.setType('Linear') fp_gen.setAtomBondTyping(fp_gen.getDefaultAtomTypingScheme()) fp_sim = canvas_sim.CanvasFingerprintSimilarity(logger=logger) fp_sim.setMetric('Tanimoto') # Bind initial values title_ranks = {} hits_titles = set() active_fingerprint = {} if isinstance(structure_file, str): # Assume Glide results are typical, skip the receptor in pv files file_index = 2 if is_valid_pv_file(structure_file) else 1 for st in structure.StructureReader(structure_file, file_index): title = st.property[id_property] _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles, title_ranks, active_fingerprint) elif hasattr(structure_file, '__iter__'): for st_index, st in enumerate(structure_file): title = st.property[id_property] _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles, title_ranks, active_fingerprint) else: msg = "Input file should be a structure file or a list of structures." raise TypeError(msg) min_tc = _calc_min_tc_total_actives(fp_sim, active_fingerprint) return FingerprintComponent(fp_gen, fp_sim, active_fingerprint, min_tc)
############################################################################## # Protected Functions ############################################################################### def _set_title_to_hits_titles_title_ranks(title, active_titles, hits_titles, title_ranks): """ Helper function for parsers to set the correct rank and title to hits_titles and title_ranks. :param title: Ligand name. :type title: str :param active_titles: Distinct active titles parsed from the actives file. :type active_titles: set(str) :param hits_titles: Distinct titles parsed from the results file. :type hits_titles: set(str) :param title_ranks: A dictionary storing active titles as keys and their ranks as value. :type title_ranks: dict(str, int) """ if title not in hits_titles: hits_titles.add(title) if title in active_titles: rank = len(hits_titles) title_ranks[rank] = title def _fingerprint_gen(st, title, active_titles, fp_gen, hits_titles, title_ranks, active_fingerprint): """ Helper function for parsers to set the correct rank and title to hits_titles and title_ranks. Also generate structure fingerprint while processing each title. :param st: Ligand structure. :type st: structure.Structure :param title: Ligand name. :type title: str :param active_titles: Distinct active titles parsed from the actives file. :type active_titles: set(str) :param fp_gen: canvas_fp.CanvasFingerprintGenerator object. :type fp_gen: canvas_fp.CanvasFingerprintGenerator :param hits_titles: Distinct titles parsed from the results file. :type hits_titles: set(str) :param title_ranks: A dictionary storing active titles as keys and their ranks as value. :type title_ranks: dict(str, int) :param active_fingerprint: Aictionary with active title as key, the corresponding structure's fingerprint as value. :type active_fingerprint: dict(str, str) """ if title not in hits_titles: hits_titles.add(title) if title in active_titles: rank = len(hits_titles) title_ranks[rank] = title if title in active_titles: active_fingerprint[title] = fp_gen.generate(st) def _calc_rank_terms(active_titles, hits_titles, title_ranks): """ Helper function for parsers to calculate and return rank related terms. :param active_titles: Distinct active titles parsed from the actives file. :type active_titles: set(str) :param hits_titles: Distinct titles parsed from the results file. :type hits_titles: set(str) :param title_ranks: A dictionary storing active titles as keys and their ranks as value. :type title_ranks: dict(str, int) :return: A tuple of total number of ranked titles, active ranks, adjusted active ranks and the missing active titles. :rtype: (int, list(int), list(int), list(str)) """ total_ranked = len(hits_titles) active_ranks = sorted(list(title_ranks)) adjusted_active_ranks = [ (rank - index) for index, rank in enumerate(active_ranks) ] missing_active_titles = [x for x in active_titles if x not in hits_titles] if missing_active_titles: logger.warning(f"Missing active titles found {missing_active_titles}") return (total_ranked, active_ranks, adjusted_active_ranks, missing_active_titles) def _calc_counts(active_titles, active_ranks, num_decoy, total_ranked): """ Helper function for parsers to calculate and return total_actives and total_ligands. NOTE: this function is unnecessary if unranked actives and unranked decoys are not something we should consider. :param active_titles: Distinct active titles parsed from the actives file. :type active_titles: set(str) :param active_ranks: List of *unadjusted* integer ranks for the actives found in the screen. For example, a screen result that placed three actives as the first three ranks has an active_ranks list of = [1, 2, 3]. :type active_ranks: list(int) :param num_decoy: The total number of decoys. If specified, the total number of ligands will be distinct active titles from actives file + num_decoy. This will enable the calculation of the correction term in calc_AUAC, should the total number of ligands not equal to the total number of ranked titles in results_file. :type num_decoy: int :param total_ranked: The total number of ligands ranked by the virtual screen scoring metric. :type total_ranked: int :return: A tuple of total number of ranked actives and total number of ligands (ranked/unranked) :rtype: (int, int) """ total_actives_from_actives = len(active_titles) total_actives_from_results = len(active_ranks) if not num_decoy: total_actives = total_actives_from_results total_ligands = total_ranked else: total_actives = total_actives_from_actives total_ligands = total_actives + num_decoy return (total_actives, total_ligands) def _calc_min_tc_total_actives(fp_sim, active_fingerprint): """ Calculate min_Tc_total_actives, a float representing the lowest Tc, Tanimoto coefficient, of all the active similarity pairs. Tc is scaled from 0.0-1.0, where a 1.0 indicates a high degree of similarity. O(N^2) complexity, N = total number of actives. :param fp_sim: canvas_sim.CanvasFingerprintSimilarity instance :type fp_sim: canvas_sim.CanvasFingerprintSimilarity :param active_fingerprint: Dictionary with active title as key, the corresponding structure's fingerprint as value :type active_fingerprint: dict(str, str) :return: Lowest Tanimoto coefficient of all the active structure pairs :rtype: float """ # The highest possible value for minTc is 1 min_Tc_total_actives = 1.0 if not active_fingerprint: msg = "_calc_min_tc_total_actives: missing fingerprints. minTc set to 1" logger.debug(msg) return min_Tc_total_actives for i_title, j_title in itertools.combinations(active_fingerprint.keys(), 2): sim = fp_sim.calculateSimilarity(active_fingerprint[i_title], active_fingerprint[j_title]) if sim < min_Tc_total_actives: min_Tc_total_actives = sim return min_Tc_total_actives