Source code for schrodinger.application.glide.glideanalysis

"""
Functions and classes for calculating Glide Docking Enrichment Factors and
creating sorted pose libraries.

Copyright Schrodinger, LLC. All rights reserved.

"""

#Contributors: K Shawn Watts, Jeff Saunders

################################################################################
# Packages
################################################################################

import math
import os
import sys
import tempfile
from past.utils import old_div

import schrodinger.analysis.enrichment as enrichment
import schrodinger.structure as structure
import schrodinger.structutils.sort as sort
import schrodinger.utils.fileutils as fileutils

################################################################################
# Globals
################################################################################
_version = '$Revision: 1.21 $'


################################################################################
# Functions
################################################################################
[docs]def get_enrichment_calculator( file_name="", file_type=None, # 'pv' or 'lib' actives=[], # noqa: M511 # titles of known actives found in file_name report_header="", total_ligands=0, sort_property_1="r_i_glide_gscore", sort_property_2="r_i_glide_emodel", ): """ Returns an schrodinger.analysis.enrichment.Calculator instance. :type file_name: string :param file_name: The path to file upon which to operate. The 'file_name' pose file may contain multiple poses for each ligand, but the analysis requires culling to the best pose per ligand. Default is ''. :type file_type: string :param file_type: Should be 'pv' or 'lib'; used to determine the position of the first ligand pose. Default is 'pv'. :type actives: list of strings :param actives: Each string corresponds to the title of a known active used in the docking experiment. Default is []. :type total_ligands: int :param total_ligands: Integer number of the total number of ligands used in the experiment. Default is 0. :type report_header: string :param report_header: Not implemented. This parameter is ignored. :type sort_property_1: string :param sort_property_1: An m2io dataname that identifies the values for single pose per ligand sorting. Default is "r_i_glide_gscore". :type sort_property_2: string :param sort_property_2: An m2io dataname that identifies the values of intra-ligand group sorting. SP and HTVS poses should use 'r_i_glide_emodel' to pick the best pose of the same ligand-title. XP poses should use 'i_glide_XP_PoseRank' to determine the best pose within a ligand-title group. Default is "r_i_glide_emodel". This is a convenience interface for schrodinger.analysis.enrichment.Calculator, which is the preferred Enrichment metric generator. schrodinger.analysis.enrichment.Calculator has more metrics (e.g. BEDROC, ROC, AUAC), support for interactive plotting, and can generate plot png format files. This function also implements sort.py for better sorting performance. Two files are created as a result of running this function; a list of active compound titles in <root>_actives.txt and a sorted structure file in <root>_efcalc<ext>, where <root> is the basename and <ext> the extension of the filename provided. """ (root, ext) = fileutils.splitext(file_name) # Sort the structure file and write out just the 'best' pose per block. # Assign the starting index, pv_files have a receptor, libs don't # if the type was not specified guess by the file ext file_index = None if file_type: file_type = file_type.lower() if file_type == 'pv': file_index = 2 elif file_type == 'lib': file_index = 1 else: raise ValueError("Invalid file type: %s" % file_type) else: # Note, this is a weak test based on file extention. if fileutils.is_poseviewer_file(file_name): file_index = 2 else: file_index = 1 sort_criteria = [(prop, sort.ASCENDING) for prop in sort_property_1.split()] intra_block_sort_criteria = [('s_m_title', sort.ASCENDING)] intra_block_sort_criteria.extend([ (prop, sort.ASCENDING) for prop in sort_property_2.split() ]) sfs = sort.StructureFileSorter( file_name, file_index, sort_criteria, intra_block_sort_criteria, keep_structures=sort._in_memory_sort_ok(file_name)) sfs.sort() sorted_structure_file = '%s_efcalc%s' % (root, ext) sfs.writeTopNFromBlock(sorted_structure_file, max_per_block=1) # Write the active compound titles to disk. actives_file = '%s_actives.txt' % (root) actives_fh = open(actives_file, 'w') for title in actives: actives_fh.write("%s\n" % title) actives_fh.close() # Determine the number decoys used in the experiment. num_decoys = total_ligands - len(actives) ef_calc = enrichment.Calculator(actives=actives_file, results=sorted_structure_file, total_decoys=num_decoys) return ef_calc
################################################################################ # Classes ################################################################################ #ev81790 Removed the deprecated Enrichment class.
[docs]class PoseLibrary: """ A helper class to sort a pose library. By default, multiple poses for one ligand-title are 'block sorted'. i.e. grouped and sorted by Title and sort_property_2, and each group is sorted by its best member's sort_property_1. The pose library may be resorted by sort_property_1, in a title agnostic manner, by calling sortByProperties(). API Example:: pl = glideanalysis.PoseLibrary(file_name='input_lib.mae') pl.writeTopPoses('output_lib.mae') :vartype debug: bool :cvar debug: Increases the verbosity of the various bookkeeping operations. :vartype dock_results: dict :ivar dock_results: A dictionary of pose properties, pose file index keys for property dict. :vartype pose_lib_order: tuple :ivar pose_lib_order: A sorted tuple sequence of all pose file indexes. :vartype best_pose_lib_order: tuple :ivar best_pose_lib_order: A sorted tuple sequence of poses file indexes, from just the best pose of each title-group. :vartype title_tuples: list :ivar title_tuples: A list of (pose indexes) tuples, one tuple per title. :vartype title_blocks: dict :ivar title_blocks: A dictionary of pose index lists, 'title' keys for list of sorted (intra title) indexes. """ debug = False
[docs] def __init__( self, file_name="", file_type=None, # 'pv' or 'lib' sort_property_1="r_i_glide_gscore", sort_property_2="r_i_glide_emodel", block_sort=True): """ :type file_name: string :param file_name: Path to the file upon with to operate. The 'file_name' pose file may contain multiple poses for each ligand. Default is "". :type file_type: string :param file_type: Should be 'pv' or 'lib'; used to determine the position of the first ligand pose. If not specified, determined from ext. :type sort_property_1: string :param sort_property_1: An m2io dataname or a space-separated list of m2io datanames that identify the values for single best pose per ligand-title sorting. The input string is broken into fields by split(). Default is "r_i_glide_gscore". :type sort_property_2: string :param sort_property_2: An m2io dataname or a space-separated list of m2io datanames that identify the values for ligand-title group sorting. The input string is broken into fields by split(). Default is "r_i_glide_emodel". SP and HTVS poses should use 'r_i_glide_emodel' to pick the best pose of the same ligand-title. XP poses should use 'i_glide_XP_PoseRank' to determine the best pose order within a ligand-title group. :type block_sort: bool :param block_sort: Boolean for the initial sorting operation. The pose library can always be resorted at a later point, this attribute just sets the default sorting operation for the instance. If True, poses are organized into ligand-title grouping as normal. If False, the poses are organized by a straight multi-key sort of sort_property_1. If there are multiple poses per ligand title this option should be used with care. Default is True. """ self.file_name = file_name # Maestro file path self.file_type = file_type # lib or pv format, determined below self.sort_property_1 = sort_property_1.strip() self.sort_property_2 = sort_property_2.strip() self.file_index = 1 # Start file position index pv=2/lib=1 self.block_sort = block_sort # Store pose file properties and initial rank by title self.pose_lib_order = [] self.best_pose_lib_order = [] # multiple poses per ligand self.dock_results = {} self.statistic = {} # Assign the starting index, pv_files have a receptor, libs don't # if the type was not specified guess by the file ext if self.file_type: self.file_type = self.file_type.lower() if self.file_type == 'pv': self.file_index = 2 elif self.file_type == 'lib': self.file_index = 1 else: raise ValueError("Invalid file type: %s" % self.file_type) else: if fileutils.is_poseviewer_file(self.file_name): self.file_index = 2 self.file_type = 'pv' else: self.file_index = 1 self.file_type = 'lib' # Store structure file properties by initial file position index index = self.file_index for st in structure.StructureReader(self.file_name, self.file_index): self.dock_results[index] = {} for p in list(st.property): self.dock_results[index][p] = st.property[p] index += 1 if self.block_sort: # Legacy mode of sorting puts the poses into ligand-title blocks. self.sortPoses() else: # Simple multi-key sorting, no ligand-title blocks self.sortByProperties() # Undecorate and sort the pose library by imposing this index order # on file. self.pose_lib_order = tuple(self.pose_lib_order) self.best_pose_lib_order = tuple(self.best_pose_lib_order) self.title_tuples = tuple(self.title_tuples) # Collect simple statistics on the library self.calculate_stats() return
def _show(self): """ Helper/debugging method to print out values. Prints table of sorted poses. """ print("PoseLibrary: Read %s, with type %s, at file index %d" % (self.file_name, self.file_type, self.file_index)) print("PoseLibrary: Found %d total poses" % len(self.dock_results)) print("Final Poses file index sorted order (All poses)") print(self.pose_lib_order) print("Final Poses file index sorted order (Best pose per ligand)") print(self.best_pose_lib_order) print("Table of Final Poses file index sorted order (All poses)") fields = [] fields.append('Title') fields.extend(self.sort_property_1.split()) fields.extend(self.sort_property_2.split()) fields.append('file_index') print(" ".join(fields)) for i in self.pose_lib_order: row = [self.dock_results[i]['s_m_title']] for p in self.sort_property_1.split(): row.append(str(self.dock_results[i][p])) for p in self.sort_property_2.split(): row.append(str(self.dock_results[i][p])) row.append(str(i)) print("\t".join(row)) return
[docs] def sortPoses(self): """ Helper method that calls sortIntraTitle(), then sortInterTitle(). No return value. """ self.sortIntraTitle() self.sortInterTitle() self.pose_lib_order = tuple(self.pose_lib_order) self.best_pose_lib_order = tuple(self.best_pose_lib_order) self.title_tuples = tuple(self.title_tuples) return
[docs] def sortIntraTitle(self): """ Creates tuples of poses with the same title, then sorts within each title-tuple by self.sort_property_2. No return value. Ligands are not 'ranked' between titles by this function. Raises an Exception if a pose without a title is encountered. """ self.title_tuples = [] # A list of index tuples self.title_blocks = {} # Helper dict props = self.sort_property_2.split() sort_p_2 = [] # List of lists indexes = [] # List of file positions with the same title titles = [] # Seed list of lists sort_p_2.append(titles) # slice 0 for p in props: sort_p_2.append([]) sort_p_2.append(indexes) # slice -1 # Create ligand-title blocks for file_index in self.dock_results: title = self.dock_results[file_index].get('s_m_title') if not title: msg = 'No title for %d' % file_index raise Exception(msg) # Populate the list of lists sort_p_2[0].append(title) for prop_index, prop in enumerate(props): prop_index += 1 # title is 0th slice so bump the index lst = sort_p_2[prop_index] lst.append(self.dock_results[file_index].get(prop)) sort_p_2[-1].append(file_index) # Decorate, sort, undecorate index title_family = list(zip(*sort_p_2)) title_family.sort() for item in title_family: title = item[0] index = item[-1] # Check for existing hash key if title in self.title_blocks: self.title_blocks[title].append(index) else: self.title_blocks[title] = [] self.title_blocks[title].append(index) # Title_blocks dict was useful for debugging/development, but # a list of tuples is nicer for sorting Inter-Title for title in self.title_blocks: self.title_tuples.append(tuple(self.title_blocks[title])) return
[docs] def sortInterTitle(self): """ Orders the title_tuple families (see `sortIntraTitle`) by self.sort_property_2. No return value. """ # Reset as an appendable lists self.best_pose_lib_order = [] props = self.sort_property_1.split() # Sort the tuples by their leading member's self.sort_property_1 families = [] sort_p_1 = [] # seed list of lists for p in props: sort_p_1.append([]) sort_p_1.append(families) # last slice, -1 # Populate the list of lists for item in self.title_tuples: best_member_index = item[0] for prop_index, prop in enumerate(props): lst = sort_p_1[prop_index] lst.append(self.dock_results[best_member_index].get(prop)) sort_p_1[-1].append(item[:]) # Copy the family index order # Decorate, sort, undecorate index tuples pose_list = list(zip(*sort_p_1)) pose_list.sort() self.title_tuples = [] # Reset title_tuples as an appendable list for item in pose_list: family = item[-1] self.title_tuples.append(family[:]) # Pick the first element to get the best ligand order for item in self.title_tuples: self.best_pose_lib_order.append(item[0]) # Flatten the tuples into a list of all self.pose_lib_order = [] for item in self.title_tuples: self.pose_lib_order.extend(list(item[:])) return
[docs] def sortByProperties(self): """ Orders the pose library in a Title agnostic manner, considering only sort_property_1; a simple (multi-key) sort of the docking results. The instance attribute 'pose_lib_order' is reassigned to the new ordered tuple, but since the block ordering is lost the attributes 'best_pose_lib_order' and 'title_tuples' are redefined as empty tuples, and 'title_blocks' is redefined to an empty dictionary. """ # Reset irrelevant data structures self.best_pose_lib_order = () self.title_tuples = () self.title_blocks = {} # Reset as an appendable list self.pose_lib_order = [] props = self.sort_property_1.split() sort_p_1 = [] # List of lists indexes = [] # List of file positions # Seed list of lists for p in props: sort_p_1.append([]) sort_p_1.append(indexes) # slice -1 # Populate the list of lists for pose_index in self.dock_results: for prop_index, p in enumerate(props): sort_p_1[prop_index].append( self.dock_results[pose_index].get(p)) sort_p_1[-1].append(pose_index) # Decorate, sort, undecorate index pose_organizer = list(zip(*sort_p_1)) pose_organizer.sort() for item in pose_organizer: self.pose_lib_order.append(item[-1]) self.pose_lib_order = tuple(self.pose_lib_order) return
[docs] def calculate_stats(self): """ Calculate simple statistics on the glide scores. Sets the self.statistic dictionary and adds the standard score (glide gscore) to the self.dock_results dict. Returns None. """ # List comprehension of glide scores glide_score = [ self.dock_results[i]['r_i_glide_gscore'] for i in self.dock_results ] glide_score_mean = old_div(sum(glide_score), len(glide_score)) glide_score_stdev = math.sqrt( old_div(sum([(i - glide_score_mean)**2 for i in glide_score]), len(glide_score))) glide_score_min = min(glide_score) glide_score_max = max(glide_score) # Median score. // is a floor division operator glide_score_median = glide_score[len(glide_score) // 2] self.statistic['glide_gscore_min'] = glide_score_min self.statistic['glide_gscore_max'] = glide_score_max self.statistic['glide_gscore_mean'] = glide_score_mean self.statistic['glide_gscore_stdev'] = glide_score_stdev self.statistic['glide_gscore_median'] = glide_score_median for i in self.dock_results: score = self.dock_results[i]['r_i_glide_gscore'] std_score = None if glide_score_stdev > 0: std_score = float( old_div((score - glide_score_mean), glide_score_stdev)) self.dock_results[i]['r_user_std_glide_gscore'] = std_score return
[docs] def write(self, output_lib_name="", indexes=(), block_size=4000): """ Outputs an ordered pose lib file. The pose order is the same as indexes' sequence of original file positions. This method loads all pose structure.Structure objects into memory only if there are fewer than 'block_size' structures. Otherwise, it reads the input file multiple times, sorting structures in 'block_size' chunks until the entire library is sorted. Returns None. """ pose_lib_indexes = list(indexes) pose_indexes = [] sts = [] (self.tmp_lib_desc, self.tmp_lib) = tempfile.mkstemp( #"pose_library__tmp_UNIQUEGIBBERISH.mae" suffix='.mae', prefix='pose_library_tmp_', dir=os.getcwd(), text=True) # See if the file size is bigger than the block size, we don't # have to count them all, just determine if we have more than the # block size. enum is 0 based, so we add 1. for count, st in enumerate(structure.StructureReader(self.file_name)): if count + 1 > block_size: break # 'Normal' sized file, should be sortable within memory if count < block_size: if self.debug: print("Sort all structures at once, read from file index: %d" \ % self.file_index) file_index = self.file_index for st in structure.StructureReader(self.file_name, self.file_index): # Skip any index not in the list if file_index in pose_lib_indexes: sts.append(st) pose_indexes.append(pose_lib_indexes.index(file_index)) file_index += 1 sorted_sts = list(zip(pose_indexes, sts)) sorted_sts.sort() for (i, st) in sorted_sts: st.append(self.tmp_lib) # 'Big' files can't be sorted in memory at once, so sort in batches else: if self.debug: print("Sorting structures in batches") while pose_lib_indexes: islice = [] for i in range(1, block_size): try: islice.append(pose_lib_indexes.pop(0)) except IndexError: break # exit for loop if self.debug: print("Sorting slice:") print(islice) sts = [] pose_indexes = [] sorted_sts = [] file_index = self.file_index for st in structure.StructureReader(self.file_name, self.file_index): if file_index in islice: sts.append(st) pose_indexes.append(islice.index(file_index)) file_index += 1 sorted_sts = list(zip(pose_indexes, sts)) sorted_sts.sort() for (i, st) in sorted_sts: st.append(self.tmp_lib) os.close(self.tmp_lib_desc) if sys.platform == 'win32' and os.path.exists(output_lib_name): os.remove(output_lib_name) os.rename(self.tmp_lib, output_lib_name) return
[docs] def writeValidPoses( self, output_lib_name="", actives=[], # noqa: M511 max_pose_per_active=10, max_pose_per_decoy=1, max_number_decoys=1000, block_size=2000): """ Writes a pose library with multiple poses per active but a single pose per 'decoy'. Returns None. """ # Container for pose indexes valid_poses = [] # Container for pose accounting valid_count = {} valid_count['__decoys'] = 0 for i in self.pose_lib_order: title = self.dock_results[i]['s_m_title'] if title not in valid_count: valid_count[title] = 0 if title in actives and valid_count[title] < max_pose_per_active: valid_poses.append(i) valid_count[title] += 1 elif valid_count[title] < max_pose_per_decoy and \ valid_count['__decoys'] < max_number_decoys: valid_poses.append(i) valid_count[title] += 1 valid_count['__decoys'] += 1 self.write(output_lib_name, indexes=valid_poses, block_size=block_size) return
[docs] def writeTopPoses(self, output_lib_name="", max_pose_per_title=1, block_size=2000): """ Writes a pose library with the top N poses per ligand. Returns None. Assumes self.title_tuples is sorted in the desired order. """ # Set the number of poses to grab N = max_pose_per_title # Container for pose indexes indexes = [] # Get the pose indexes for item in self.title_tuples: indexes.extend(item[:N]) # Verbose if self.debug: print("writeTopPoses... max_pose_per_title: ", N) print("writeTopPoses... indexes:") print(indexes) # Write them out self.write(output_lib_name, indexes=indexes, block_size=block_size) return
# EOF