Source code for schrodinger.pipeline.stages.glide

# python module
"""
Glide Docking & Gridgen stages.

    DockingStage - stage for running Glide docking jobs

    GridgenStage - stage for generating Glide grids (not functional)
        HBOND_CONSTRAINTS: List of constraints in format: "label <atomnum>"
        METAL_CONSTRAINTS: List of constraints in format: "label <atomnum>"
        POSIT_CONSTRAINTS: List of constraints in format: "label <x> <y> <z> <radius>"
        NOE_CONSTRAINTS: List of constraints in format: "label <x> <y> <z> <min> <max>"

Copyright Schrodinger, LLC. All rights reserved.

"""

import csv
import gzip  # For compress()
import os
import re
import shutil
import sys
import tarfile  # For cleanup
import time

import schrodinger.utils.subprocess as subprocess
from schrodinger import structure
from schrodinger.application.glide import glide
from schrodinger.application.glide import glideanalysis
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage
from schrodinger.utils import csv_unicode
from schrodinger.utils import fileutils

glide_path = 'glide'
GLIDE_MERGE = os.path.join(os.environ['SCHRODINGER'], 'utilities',
                           'glide_merge')


def compress(origFile):
    """
    Compresses the specified file and removes the original
    """
    if origFile.endswith('.mae'):
        compFile = origFile + 'gz'  # To produce .maegz
    else:
        compFile = origFile + '.gz'

    # Copy in binary mode so the data reaches gzip unchanged:
    orig_handle = open(origFile, 'rb')
    gzip_handle = gzip.GzipFile(compFile, mode='wb')
    for line in orig_handle:
        gzip_handle.write(line)
    orig_handle.close()
    gzip_handle.close()

    fileutils.force_remove(origFile)
    return compFile
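# Example of the naming convention used by compress() above (file names are
# hypothetical): compress("subjob1_pv.mae") writes "subjob1_pv.maegz" and
# removes the original, while compress("subjob1.log") writes "subjob1.log.gz";
# in both cases the compressed file name is returned.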
class DockingStage(stage.Stage):
    """
    VSW stage for performing Glide docking
    """
    def __init__(self, *args, **kwargs):
        # For these specs, None represents MMIM default
        # Use the specification that $SCHRODINGER/glide uses, except append
        # a few VSW-specific keywords:
        docking_specs = "\n".join(glide.validation_specs('docking'))
        specs = docking_specs + """
        RECOMBINE = boolean(default=True) # Whether to recombine ligand files
        UNIQUEFIELD = string(default="s_m_title")
        OUTVARIANTFIELD = string(default=None) # Field for generated variant codes
        NUM_TO_KEEP = integer(default=0) # Keep at most this many output ligands
        PERCENT_TO_KEEP = float(default=0.0) # Keep only this percentage of the (input) ligands
        COMPRESS_IN_FILES = boolean(default=True) # Whether to compress subjob *_in files
        BEST_BY_TITLE = boolean(default=False) # Whether to keep only the best scoring pose per compound at the end.
        LVDW = float(default=0.8) # Old name for LIG_VSCALE. glide.py will recognize it.
        LIGCCUT = float(default=0.15) # Old name for LIG_CCUT. glide.py will recognize it.
        LIGAND_CONFS = string(default=None) # Old name for DOCKING_METHOD. glide.py will recognize it.
        DIELECTRIC = float(default=None) # Old name for GLIDE_DIELCO (ignored)

        # Metal and H-bond constraint options:
        [CONSTRAINT_GROUP:1]
        USE_CONS = string()
        NREQUIRED_CONS = string()
        [CONSTRAINT_GROUP:2]
        USE_CONS = string()
        NREQUIRED_CONS = string()
        [CONSTRAINT_GROUP:3]
        USE_CONS = string()
        NREQUIRED_CONS = string()
        [CONSTRAINT_GROUP:4]
        USE_CONS = string()
        NREQUIRED_CONS = string()

        [FEATURE:1]
        __many__ = string()
        [FEATURE:2]
        __many__ = string()
        [FEATURE:3]
        __many__ = string()
        [FEATURE:4]
        __many__ = string()
        [FEATURE:5]
        __many__ = string()
        """

        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.setMainProduct('glide')

        self.addExpectedInput(1, "structures", True)
        self.addExpectedInput(2, "grid", True)

        self.addExpectedOutput(1, "structures", True)
        # Input ligands after GENCODES
        self.addExpectedOutput(2, "structures", False)

        # BEST_BY_TITLE - GUI match:
        # 'Only best scoring pose':'YES'
        # 'All good scoring poses':'NO'
        # 'All states':'NO'

        self.recombined_ligands = []
        self.docked_ligands = {}  # Key: jobname, value: output ligand file
        self.jobdj = None

        # Whether to continue with jobs in the event of subjob failure. The
        # default is True and can be overridden from the command line using
        # the noforce option. See VSW-886
        self._force_jobs = True

        self.status = "NOT STARTED"
        self.input_compound_count = None
        self.subjob_inposes = {}
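    # Illustrative DockingStage keyword block (values are hypothetical),
    # combining standard Glide docking keywords with the VSW-specific ones
    # declared above; here SP docking keeps only the best-scoring pose of the
    # top 10% of input compounds:
    #
    #     PRECISION        SP
    #     DOCKING_METHOD   confgen
    #     POSES_PER_LIG    1
    #     PERCENT_TO_KEEP  10.0
    #     BEST_BY_TITLE    True
    #     UNIQUEFIELD      s_m_title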
    def checkInputs(self):
        in2 = self.getInput(2)
        in2.check()
        if not in2.getPath():
            raise RuntimeError("Docking Stage: No grid in Input Object 2!")
        return True
    def recombineInputLigands(self):
        # Run modes:
        # 1. Recombining - and generating codes
        # 2. Recombining - and not generating codes
        # 3. Not Recombining - and not generating codes

        # These keywords are ignored if not recombining:
        self.uniquefield = self['UNIQUEFIELD']
        variantfield = self['OUTVARIANTFIELD']
        self.gencodes = (variantfield is not None)

        recombine = self['RECOMBINE']
        if self.gencodes and not recombine:
            # Ev:76572
            self.info(
                "WARNING: Since variant codes need to be generated, will recombine"
            )
            recombine = True

        if not recombine:  # No recombine (Ev:65662)
            if self.gencodes:
                raise RuntimeError(
                    "ERROR: OUTVARIANTFIELD is not supported if RECOMBINE is FALSE"
                )
            # VSW-813
            if self.uniquefield.upper() == 'NONE':
                raise RuntimeError(
                    "ERROR: UNIQUEFIELD can't be set to NONE if RECOMBINE is FALSE"
                )
            # This is because the docking subjob may produce more than one
            # pose per input compound. Since no unique-field is specified, it
            # is not possible to know how to group the poses into compounds.

            self.info("Will NOT recombine or retitle input ligands")
            self.recombined_ligands = self.getInput(1).getFiles()
            self.info("There will be %i subjobs." %
                      len(self.recombined_ligands))
            self.info("Will identify ligands based on unique field: %s" %
                      self.uniquefield)

            self.info(
                "Counting numbers of structures and compounds in input ligands..."
            )
            self.subjob_inposes = {}  # number of poses in each input job
            filenum = 0
            st_num = 0
            # Count the number of poses in each input file and total number
            # of compounds: Ev:105233
            compound_ids = set()
            for filename in self.recombined_ligands:
                filenum += 1
                # Count the number of structures & compounds in this file:
                file_count = 0
                file_st_num = 0
                for st in pipeutils.get_reader(filename, astext=True):
                    st_num += 1
                    file_st_num += 1
                    file_count += 1
                    # Write progress period:
                    if st_num % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    # Get compound code:
                    try:
                        # Ev:96648 & Ev: 104439
                        compound_id = pipeutils.read_unique_field(
                            st, self.uniquefield)
                        compound_ids.add(compound_id)
                    except KeyError:
                        err = "No field %s in structure #%i (file %s)!" % (
                            self.uniquefield, file_st_num, filename)
                        raise RuntimeError(err)
                self.subjob_inposes[filenum] = file_count

            number_of_compounds = len(compound_ids)
            # Otherwise, the number_of_compounds was already manually
            # incremented
            self.input_compound_count = number_of_compounds

            self.info('')  # To have a new line after the progress periods
            self.info("Number of input compounds: %i" %
                      self.input_compound_count)
            self.info("Number of input structures: %i" % st_num)

            msg = "\nInput number of poses for each subjob:"
            msg += "\n%s" % str(self.subjob_inposes)
            self.info(msg)

            # Ev:118766
            self.info(
                "\nNOTE: %i jobs will be launched, as that's the number of input files. (The option to 'use input files directly for subjobs' was used.)\n"
                % len(self.subjob_inposes))
            return

        # IF GOT HERE THEN RECOMBINING

        if self.gencodes:
            if self.uniquefield.upper() == "NONE":
                self.info(
                    "Will treat each input structure as its own compound (renumbering ligands)."
                )
            else:
                self.info("Will identify ligands based on unique field: %s" %
                          self.uniquefield)
            self.info("Will save generated variant codes into field: %s" %
                      variantfield)
        else:
            self.info("Will identify ligands based on field: %s" %
                      self.uniquefield)

        method = self['DOCKING_METHOD'].lower()
        if method not in ['confgen', 'rigid', 'mininplace', 'inplace']:
            raise RuntimeError("ERROR: Invalid confs option:%s" % method)

        # The limits are set so that smallest job will take 5-15 minutes.
        # The limits are set so that largest job will take 20-30 hours.
        # Bigger numbers mean faster calculation time
        min_job_size, max_job_size = None, None

        precision = self['PRECISION'].upper()
        if precision not in ['HTVS', 'SP', 'XP']:
            raise RuntimeError("Invalid docking PRECISION: %s" % precision)

        # Determine the approximate size of a 1 hour job:
        if precision == "HTVS":
            if method == "inplace" or method == "mininplace":
                msg = "ERROR: %s unavailable for %s" % (method, precision)
                raise RuntimeError(msg)
            elif method == "rigid":
                min_job_size = 8000
            elif method == "confgen":
                min_job_size = 4000
        elif precision == "SP":
            if method == "inplace":
                # ~25min (Try to keep file sizes under 10K)
                min_job_size = 10000
            elif method == "mininplace":
                min_job_size = 5000  # 8,000 ~ 1 hour; 5,000 ~ 40min
            elif method == "rigid":
                min_job_size = 2000
            elif method == "confgen":
                min_job_size = 300
        elif precision == "XP":
            if method == "inplace":
                min_job_size = 4000
            elif method == "mininplace":
                min_job_size = 100
            elif method == "rigid":
                min_job_size = 200
            elif method == "confgen":
                min_job_size = 20

        if min_job_size is None:
            # Should never happen
            msg = "ERROR: Unknown precision/method combination: %s/%s" % (
                precision, method)
            raise RuntimeError(msg)

        max_job_size = min_job_size * 10  # about 10 hours

        input_files = self.getInput(1).getFiles()

        try:
            self.compress_in = self['COMPRESS_IN_FILES']
        except:
            # older input file
            self.compress_in = False

        initial_njobs, adjust_njobs = self.getNJobs()
        # ^--- whether or not to adjust the number of jobs for optimal job size

        self.info(" Separating ligands into %i subjobs." % initial_njobs)
        if adjust_njobs:
            self.info(" (will adjust if needed)")
        else:
            self.info(" (will not adjust)")

        # Algorithm if adjusting the number of subjobs:
        # 1. Write to first file until min_job_size is reached,
        # 2. Create another file and write to it until minimum is reached
        # 3. Repeat step #2 until NJOBS # of files are created.
        # 4. Add ligands to above files until max_job_size is reached
        # 5. After that, start creating additional files.

        self.subjob_compounds = {}  # Number of compounds in each input job
        self.subjob_inposes = {}  # Number of poses in each input job
        recombined_writers_dict = {}  # keys are filenums
        compound_id = None
        filenum = 0  # First structure will be written to filenum #1

        # The compound code of the previous structure:
        current_compound_id = None
        current_compound_id_variant = 0
        number_of_compounds = 0
        st_num = 0  # Processing ligand #

        reader = structure.MultiFileStructureReader(input_files)
        for st in reader:
            if st.property.get("b_glide_receptor"):
                # Skip receptors in PV files
                continue
            st_num += 1
            # Write progress period:
            if st_num % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()

            if self.uniquefield.upper() == 'NONE':
                compound_id = str(st_num)
                st.property["i_vsw_st_num_backup"] = st_num
            else:
                # Get compound code:
                try:
                    # Ev:96648 & Ev: 104439
                    compound_id = pipeutils.read_unique_field(
                        st, self.uniquefield)
                except KeyError:
                    err = "No field %s in structure #%i (file %s)!" % (
                        self.uniquefield, reader.index_in_current_file,
                        reader.current_filename)
                    raise RuntimeError(err)

            if compound_id != current_compound_id or current_compound_id is None:
                # New compound:
                current_compound_id_variant = 1
                number_of_compounds += 1
                current_compound_id = compound_id
            else:
                # Another variant of the same compound:
                current_compound_id_variant += 1

            # Replacement for GenCodes (Ev:65971)
            if self.gencodes:
                # Regenerating titles & variant codes:
                # FIXME take this code out
                st.property["s_pipeline_title_backup"] = st.title
                st.title = str(compound_id)
                # add ligand variant property based on number of poses counted
                # for the current compound.
                s = "%s-%i" % (compound_id, current_compound_id_variant)
                st.property[variantfield] = s

            # If the compound_id (compound code) for this ligand is the same
            # as for the previous ligand, write them to the same subjob file.
            if current_compound_id_variant == 1:
                # This compound is different from a previous compound.
                # (will also happen for the first structure)
                # Consider writing it to a different file.

                # Determine what the output file for next ligand will be:
                if filenum == 0:
                    # Adjusting or not; start with first file:
                    filenum = 1
                elif not adjust_njobs:
                    # Do not adjust njobs, mix the ligands:
                    filenum += 1
                    if filenum > initial_njobs:
                        filenum = 1
                else:
                    # Adjusting NJOBS (and not the first structure)
                    # Stay in first file if in under-min zone
                    # Go to next EXISTING file if in min-max zone
                    # Create a NEW file if in over-max zone
                    curr_poses = self.subjob_inposes[filenum]
                    if curr_poses >= min_job_size:
                        # min-max or over-max
                        if curr_poses >= max_job_size:
                            # Already wrote maximum number of structures to
                            # this file. Create a new file:
                            filenum += 1
                        else:
                            # Go to the next EXISTING file:
                            filenum += 1
                            if filenum > initial_njobs:
                                filenum = 1
                    else:
                        # Keep writing poses to the first input file
                        pass

                # Either way, another compound was added to <filenum>
                try:
                    self.subjob_compounds[filenum] += 1
                except KeyError:
                    self.subjob_compounds[filenum] = 1
            else:
                # Same compound as previous structure
                pass  # Otherwise write to same file as previous structure

            # Write ligand to current file:
            if filenum not in recombined_writers_dict:
                # first ligand
                filename = self.genFileName(filenum=filenum,
                                            extension="_in.mae")
                if self.compress_in:
                    filename += 'gz'
                writer = structure.StructureWriter(filename)
                writer.filename = filename
                recombined_writers_dict[filenum] = writer
            recombined_writers_dict[filenum].append(st)
            try:
                self.subjob_inposes[filenum] += 1
            except KeyError:
                self.subjob_inposes[filenum] = 1

        self.input_compound_count = number_of_compounds

        self.info('')  # To have a new line after the progress periods
        self.info("Number of input compounds: %i" % self.input_compound_count)
        self.info("Number of input structures: %i" % st_num)

        if len(recombined_writers_dict) != initial_njobs:
            if adjust_njobs:
                self.info("Number of subjobs was adjusted to %i." %
                          len(recombined_writers_dict))
            else:
                self.info("Number of output subjobs: %i" %
                          len(recombined_writers_dict))

        num_list = []
        for i, numposes in self.subjob_inposes.items():
            num_cmpds = self.subjob_compounds[i]
            # NUM:(COMPOUNDS, STRUCTURES)
            num_list.append("%i:(%i, %i)" % (i, num_cmpds, numposes))
        msg = "\nInput number of (compounds, structures) for each subjob:"
        msg += "\n%s" % ", ".join(num_list)
        self.info(msg)

        # Close file handles:
        recombined_files = []
        for writer in recombined_writers_dict.values():
            recombined_files.append(writer.filename)
            writer.close()

        self.recombined_ligands = recombined_files

        self.info('')
        self.checkFiles(self.recombined_ligands)
    def setupJobs(self):
        """
        Docks the structures in ligfiles to the grid <gridprefix>.
        """
        self.jobdj = self.getJobDJ()
        self.glide_jobnames = []

        gridfile = self.getInput(2).getPath()

        output_type = self.get('POSE_OUTTYPE', 'pv')
        poses_per_lig = self.get('POSES_PER_LIG')
        if poses_per_lig is None:
            poses_per_lig = 1

        if output_type.lower() in ['pv', 'poseviewer']:
            self.out_pv = True
        elif output_type.lower() in ['lib', 'ligandlib']:
            self.out_pv = False
        else:
            raise RuntimeError("Bad output type: %s" % output_type)

        # Default for this keyword in glide.py is None (Use MMIM default)
        # MMIM default is True.
        self.compress_poses = self['COMPRESS_POSES']
        if self.compress_poses is None:
            # Keyword not specified Ev:87161:
            self.compress_poses = True

        for i, ligfile in enumerate(self.recombined_ligands):
            filenum = i + 1  # starts at 1
            # Just so that we keep all of the output.
            nreport = self.subjob_inposes[filenum] * poses_per_lig
            glide_jobname = self.genFileName(filenum=filenum)
            input_file = self.write_glide_input_file(ligfile, gridfile,
                                                     nreport, glide_jobname)
            cmd = [glide_path, '-RESTART', input_file]
            self.debug("CMD: %s" % cmd)
            self.jobdj.addJob(cmd)
            self.glide_jobnames.append(glide_jobname)
    def processJobOutputs(self):
        docked_ligand_files = []
        for filename in self.docked_ligands.values():
            if filename is not None:
                # If at least one ligand from this subjob docked:
                docked_ligand_files.append(filename)

        if not docked_ligand_files:
            msg = "Glide subjob(s) did not produce any poses"
            self.error(msg)
            #raise RuntimeError(msg)
            self.setOutput(1, pipeio.Structures([]))
            return

        sys.stdout.flush()

        num_comps_to_keep = self['NUM_TO_KEEP']
        percent_to_keep = self['PERCENT_TO_KEEP']

        if num_comps_to_keep and percent_to_keep:
            raise RuntimeError(
                '"NUM_TO_KEEP" and "PERCENT_TO_KEEP" are mutually exclusive')
        if not num_comps_to_keep and not percent_to_keep:
            percent_to_keep = 100.0

        if num_comps_to_keep:
            if num_comps_to_keep > self.input_compound_count:
                num_comps_to_keep = self.input_compound_count
            msg = " Keeping %i of %i compounds." % (num_comps_to_keep,
                                                    self.input_compound_count)
            self.info(msg)
            sys.stdout.flush()
        else:
            int_percent = int(percent_to_keep)
            num_comps_to_keep = int(percent_to_keep *
                                    self.input_compound_count / 100)
            if num_comps_to_keep < 1:
                num_comps_to_keep = 1
            if num_comps_to_keep > self.input_compound_count:
                num_comps_to_keep = self.input_compound_count
            if int_percent == percent_to_keep:
                self.info(" Keeping %i" % int_percent + "% of compounds.")
            else:
                self.info(" Keeping %f" % percent_to_keep + "% of compounds.")
            msg = " Actually keeping %i of %i compounds." % (
                num_comps_to_keep, self.input_compound_count)
            self.info(msg)
            sys.stdout.flush()

        combined_logfile = self.genFileName(extension="_combined.log")
        combined_logfile_fh = open(combined_logfile, 'w')

        best_by_compound = self['BEST_BY_TITLE']
        self.info(" Best by compound: " + str(best_by_compound))
        sys.stdout.flush()

        total_cpu_time = 0  # seconds
        myre = re.compile(r'Total\sCPU\stime\s=\s+(\d+?\.\d+?)')

        self.info(" Calculating CPU time and combining log files...")
        for glide_jobname, outfile in self.docked_ligands.items():
            logfile = glide_jobname + '.log'
            if outfile is None:
                # This subjob did not produce any poses
                continue
            # Merge the *.log file:
            if not os.path.isfile(logfile):
                self.error(" Glide log file is missing: %s" % logfile)
                combined_logfile_fh = None
            elif combined_logfile_fh:
                # log file exists and all OK till now:
                self.debug(" Combining %s..." % logfile)
                try:
                    with open(logfile) as fh:
                        for line in fh:
                            match = myre.search(line)
                            if match:
                                total_cpu_time += float(match.group(1))
                            combined_logfile_fh.write(line)
                except IOError:
                    self.error(" Error merging Glide log file")
                    combined_logfile_fh = None

        # Close the file handle if all files merged successfully; if there
        # was a problem, remove the file altogether:
        if combined_logfile_fh:
            combined_logfile_fh.close()
        elif os.path.isfile(combined_logfile):
            fileutils.force_remove(combined_logfile)
            total_cpu_time = None

        if total_cpu_time:
            sec = int(total_cpu_time * 10) / 10.0
            hrs = int(sec // 3600)
            msg = "\nAll Glide subjobs: Total CPU time = %i seconds ( %i hours)" % (
                sec, hrs)
            self.info(msg)
            total_cpu_time = int(total_cpu_time)
            sec = total_cpu_time % 60
            mins = ((total_cpu_time - sec) // 60) % 60
            hrs = (total_cpu_time - sec - mins * 60) // 3600
            msg = " Human readable = %i hours, %i minutes, %i seconds\n" % (
                hrs, mins, sec)
            self.info(msg)
            sys.stdout.flush()
        else:
            self.info(
                " have not calculated CPU time or generated combined logfile")
            sys.stdout.flush()

        merged_file, num_merged_sts = self.mergeGlideOutputs(
            docked_ligand_files, num_comps_to_keep, best_by_compound)

        self.info('')  # Line break

        if merged_file is None:
            # May happen in SIP mode, if no poses docked. VSW-926
            self.setOutput(1, pipeio.Structures([]))
        else:
            self.setOutput(1, pipeio.Structures([merged_file],
                                                num_merged_sts))

        if self.gencodes and self.outputRequested(2):
            self.setOutput(2, pipeio.Structures(self.recombined_ligands))

        self.addOutputFile(combined_logfile)
        self.info("Combined logfile: " + combined_logfile)
        sys.stdout.flush()
    def write_glide_input_file(self, ligfile, gridfile, nreport,
                               glide_jobname):
        """
        Write Glide docking job input file
        """
        self.checkFile(ligfile)

        precision = self['PRECISION']
        if (precision == "HTVS"):
            mode = 'htvs'
        elif (precision == "SP"):
            mode = 'sp'
        elif (precision == "XP"):
            mode = 'xp'
        else:
            raise RuntimeError("ERROR: Invalid PRECISION value: %s" %
                               precision)

        if self['WRITE_XP_DESC'] and mode == 'htvs':
            msg = "ERROR: Can't write XP descriptors for %s mode." % precision
            raise RuntimeError(msg)

        # Setup keywords to pass to glide.Dock:
        tmp_config = glide.Dock({})
        keywords = {}
        # Pass all non-VSW specific keywords to Glide:
        for item in glide.validation_specs('docking'):
            if not item:
                continue
            key = item.split()[0]
            value = self.get(key)
            if value != tmp_config.default_values[key]:
                # Add only non-default values VSW-888:
                keywords[key] = value

        # Overwrite some keywords:
        keywords['GRIDFILE'] = gridfile
        keywords['JOBNAME'] = glide_jobname
        keywords['LIGANDFILE'] = ligfile
        keywords['PRECISION'] = precision
        keywords['NREPORT'] = nreport

        # Overwrite POSE_OUTPUT keyword:
        if self.out_pv:
            keywords['POSE_OUTTYPE'] = 'poseviewer'
        else:
            keywords['POSE_OUTTYPE'] = 'ligandlib'

        if keywords.get('GSCORE_CUTOFF') == 'None':
            # older versions of vswinput.py wrote "GSCORE_CUTOFF None" to the
            # input file, and "None" is not a valid float (note that this is
            # actually the string "None", not the Python None object).
            del keywords['GSCORE_CUTOFF']

        # Pass lists of HBOND & METAL constraints to $SCHRODINGER/glide:
        constraints = self['USE_CONS']
        # USE_CONS specified (no constraint groups):
        if constraints:
            nrequired_str = self['NREQUIRED_CONS']
            keywords['CONSTRAINT_GROUP:1'] = \
                {'USE_CONS': constraints,
                 'NREQUIRED_CONS': nrequired_str}

        # Pass CONSTRAINT_GROUP & FEATURE keywords to glide.py:
        for name, section in self.items():
            if name.startswith('CONSTRAINT_GROUP') and section:
                keywords[name] = dict(section)
            if name.startswith('FEATURE') and section:
                keywords[name] = dict(section)

        msg = '*******************\nPassing to glide.py: %s\n*******************' % str(
            keywords)
        self.debug(msg)

        # Write the input file:
        input_file = glide.Dock(keywords).writeSimplified()

        # Make sure the file exists:
        self.checkFile(input_file)
        return input_file
    def mergeGlideOutputs(self, input_files, num_comps_to_keep,
                          best_by_compound):
        """
        Takes in a list of _lib.mae or _pv.mae files, and merges them together
        based on glidescore. Assumes that the files have already been sorted
        with glide_sort (default in Glide).

        :param input_files: List of docked pose files (PV or LIB).
        :type input_files: list of str

        :param num_comps_to_keep: Stop merging when this number of compounds
            have been reached.
        :type num_comps_to_keep: int

        :param best_by_compound: whether to keep only the best scoring pose
            per compound (based on unique ID property).

        :return: Merged file path and number of structures in it (including
            rec). If none of the structures docked (SIP mode), will return
            (None, 0).
        :rtype: (str, int)
        """
        inputs_file = self.genFileName(extension="_files_to_merge.txt")
        fh = open(inputs_file, 'w')
        assert len(input_files) > 0
        for filename in input_files:
            fh.write(filename + '\n')
        fh.close()
        assert os.path.isfile(inputs_file)

        if self.out_pv:
            out_ext = '_pv.mae'
        else:
            out_ext = '_lib.mae'
        if self.compress_poses:
            out_ext += 'gz'
        merged_file = self.genOutputFileName(1, extension=out_ext)

        if self.uniquefield.upper() == 'NONE':
            id_prop = "i_vsw_st_num_backup"
        else:
            id_prop = self.uniquefield

        cmd = [
            GLIDE_MERGE, '-f', inputs_file, '-m', str(num_comps_to_keep),
            '-id_prop', id_prop, '-o', merged_file
        ]
        if best_by_compound:
            cmd += ['-p', '1']

        # FIXME
        # In case NREPORT keyword was specified for this docking stage:
        # nreport = self.get('NREPORT', 0)
        # if nreport:
        #     cmd += ['-n', str(nreport)]

        self.info("Merging the Glide output files...")
        self.info(subprocess.list2cmdline(cmd))
        gm_log = os.path.abspath('glide_merge.log')
        self.info("For progress, see: %s" % gm_log)

        with open(gm_log, 'w') as fh:
            p = subprocess.Popen(cmd,
                                 stderr=fh,
                                 stdout=fh,
                                 universal_newlines=True)
            ret = p.wait()

        self.info(pipeutils.get_last_20_lines(gm_log))
        if ret != 0:
            self.exit("glide_merge failed; see %s" % gm_log)

        with open(gm_log) as fh:
            stdout = fh.read()
        if "** NO ACCEPTABLE LIGAND POSES WERE FOUND **" in stdout:
            return None, 0
        else:
            self.info("glide_merge completed successfully")

        self.checkFile(merged_file)

        # Create the CSV file and count the number of compounds and sts:
        self.info("Reading merged structures...")

        # File that will need to be post-processed to generate the CSV:
        # This file is in format: propname1, value1, propname2, value2, ...
        pre_csv_file = self.genFileName(extension="_tmp.csv")

        csv_props = set()  # Set of props in all structures
        num_kept_structures = 0
        kept_compounds = set()
        with csv_unicode.writer_open(pre_csv_file) as precsvfh:
            precsvwriter = csv.writer(precsvfh)
            for st in pipeutils.get_reader(merged_file, astext=True):
                if st.property.get("b_glide_receptor"):
                    continue
                # Write the properties to the pre-csv file:
                row = []
                for propname, value in st.property.items():
                    row += [propname, value]
                    csv_props.add(propname)
                precsvwriter.writerow(row)
                num_kept_structures += 1
                kept_compounds.add(st.property[id_prop])
        assert num_kept_structures > 0
        num_kept_compounds = len(kept_compounds)
        assert os.path.isfile(pre_csv_file)

        if num_kept_compounds < num_comps_to_keep:
            self.warning("WARNING: Number of Glide output compounds is less "
                         "than the requested number of compounds to keep")
        self.info(" Number of compounds expected to keep: %i" %
                  num_comps_to_keep)
        self.info(" Number of compounds kept: %i" % num_kept_compounds)
        self.info(" Number of structures kept: %i" % num_kept_structures)

        self.generateCsvFile(pre_csv_file, csv_props)

        fileutils.force_remove(pre_csv_file)
        fileutils.force_remove(inputs_file)

        if self.out_pv:
            num_kept_structures += 1

        return merged_file, num_kept_structures
    def generateCsvFile(self, pre_csv_file, csv_props):
        self.info("Generating csv report file...")
        # Generate the CSV file from the pre-csv file:
        csv_file = self.genOutputFileName(1, extension='.csv')
        with csv_unicode.reader_open(pre_csv_file) as precsvfh, \
                csv_unicode.writer_open(csv_file) as csvfh:
            csvwriter = csv.writer(csvfh)
            csv_props_list = list(csv_props)
            csvwriter.writerow(csv_props_list)
            for predata in csv.reader(precsvfh):
                # Generate a dict from the properties for this ct:
                predict = {}
                key = None
                for item in predata:
                    if key:
                        predict[key] = item
                        key = None
                    else:
                        key = item
                # Convert this dict to a row in the actual CSV file:
                newdata = []
                for prop in csv_props_list:
                    value = predict.get(prop, '')
                    newdata.append(value)
                csvwriter.writerow(newdata)
        self.info('Properties CSV file: %s' % csv_file)
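    # Illustrative sketch of the conversion done by generateCsvFile()
    # (property names and values are hypothetical). Each pre-csv row stores
    # alternating property names and values for one structure:
    #
    #     s_m_title,lig-1,r_i_docking_score,-8.21,r_i_glide_emodel,-61.3
    #
    # The final CSV writes the union of property names once as the header and
    # one value column per property, left blank where a structure lacks that
    # property:
    #
    #     s_m_title,r_i_docking_score,r_i_glide_emodel
    #     lig-1,-8.21,-61.3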
    def check_subjob_output(self, glide_jobname):
        """
        Will return the output file for the specified subjob. If no valid
        poses were produced, None is returned. Raises a descriptive
        RuntimeError if logfile and/or output file is incomplete.
        """
        logfile = glide_jobname + '.log'
        if not os.path.isfile(logfile):
            msg = "Both output and log files are missing for job: %s" % glide_jobname
            raise RuntimeError(msg)

        outfile = self._find_subjob_output(glide_jobname)
        if outfile:
            return outfile

        # If we got here, then the sorted output file was missing for this
        # subjob

        # Parse the log file:
        no_out_line = False
        glide_sort_started = False
        glide_sort_command = None
        glide_sort_succeeded = False

        # Output file is missing but log file is in place:
        # Parse the log file to see if Glide produced any structures:
        for line in pipeutils.BackwardsReader(logfile):
            # Ev:92076 for the "neither output file" test:
            if 'no output lig structures' in line or \
                    'No Ligand Poses were written' in line or \
                    'No report data collected' in line or \
                    'glide_sort command succeeded, but neither output file' in line:
                no_out_line = True
            if 'Glide is executing the glide_sort command' in line:
                glide_sort_started = True
            if 'utilities/glide_sort' in line or r'utilities\glide_sort' in line:
                glide_sort_command = line
            if 'glide_sort command succeeded' in line:
                glide_sort_succeeded = True

        if no_out_line:
            # No output poses
            return None
        elif glide_sort_succeeded:
            msg = "Glide_sort succeeded, but could not locate output file"
            self.error(msg)
            raise RuntimeError(msg)
        elif glide_sort_started:
            # Poses were produced and glide_sort was launched
            msg = 'Glide produced structures but output file could not be located'
            self.error(msg)

            if not glide_sort_command:
                msg = "No glide_sort command in %s file" % logfile
                raise RuntimeError(msg)

            if not os.path.isfile(glide_jobname + '_raw.mae') \
                    and not os.path.isfile(glide_jobname + '_raw.maegz') \
                    and not os.path.isfile(glide_jobname + '_raw.mae.gz'):
                msg = "Raw file is missing! Cannot run glide_sort."
                raise RuntimeError(msg)

            # Try to re-run glide_sort:
            self.info("Running glide_sort to sort the output poses...")
            self.info('\nRUNNING CMD: %s' % glide_sort_command)
            os.system(glide_sort_command)

            # Check whether glide_sort produced a valid output file:
            outfile = self._find_subjob_output(glide_jobname)
            if outfile:
                self.info("\nSorted poses file: %s\n" % outfile)
                return outfile
            msg = "glide_sort failed for subjob: %s" % glide_jobname
            raise RuntimeError(msg)
        else:
            # Invalid log file; neither line present
            msg = "Glide output file missing and log file is incomplete for subjob: %s" % glide_jobname
            raise RuntimeError(msg)
    def _find_subjob_output(self, glide_jobname):
        if self.out_pv:
            outfile = glide_jobname + '_pv.mae'
        else:
            outfile = glide_jobname + '_lib.mae'

        if os.path.isfile(outfile):
            return outfile
        elif os.path.isfile(outfile + 'gz'):
            return outfile + 'gz'
        elif os.path.isfile(outfile + '.gz'):
            return outfile + '.gz'
        else:
            return None
    def buildDockedLigandsMap(self):
        """
        Builds up a dict of self.docked_ligands from the job names in
        self.glide_jobnames. If self._force_jobs is False, the job fails in
        the event of subjob failure. Otherwise, the errors are logged and the
        job continues.
        """
        self.docked_ligands = {}
        failed_jobnames = []
        for glide_jobname in self.glide_jobnames:
            try:
                outfile = self.check_subjob_output(glide_jobname)
            except RuntimeError as err:
                # Log file could not be read
                self.error(str(err))
                failed_jobnames.append(glide_jobname)
                continue
            else:
                # Log file read OK
                if outfile is None:
                    msg = 'WARNING: Glide produced no output structures for job: %s' % glide_jobname
                    self.warning(msg)
                    self.docked_ligands[glide_jobname] = None
                else:
                    self.docked_ligands[glide_jobname] = outfile

        if failed_jobnames:
            if self._force_jobs:
                num_failures = len(failed_jobnames)
                num_total = len(self.glide_jobnames)
                for failed_jobname in failed_jobnames:
                    failure_msg = "WARNING: %s failed" % failed_jobname
                    self.warning(failure_msg)
                msg = ("WARNING: %i subjobs failed out of a total of %i. "
                       "Continuing with job because '-force' option is "
                       "selected. " % (num_failures, num_total))
                self.warning(msg)
            else:
                self.dump()
                self.exit("Some jobs failed. Try restarting. Exiting...")

        self.status = "PROCESSING FILES"
        self.dump()
    def operate(self):
        """
        Perform an operation on the input files.
        """
        if self.status == "NOT STARTED" or \
                self.status == "PREPARING INPUT FILES":
            # Use LIGAND_CONFS (old name for DOCKING_METHOD) if specified:
            # Ev:68426
            # Get DOCKING_METHOD keyword (or LIGAND_CONFS, old name):
            if self['LIGAND_CONFS']:
                # Old keyword specified:
                self['DOCKING_METHOD'] = self['LIGAND_CONFS']
            if not self['DOCKING_METHOD']:
                self['DOCKING_METHOD'] = 'confgen'  # Use default

            self.status = "PREPARING INPUT FILES"
            self.recombineInputLigands()
            self.status = "SETTING UP JOBS"
            self.dump()

        if self.status == "SETTING UP JOBS":
            self.info("Setting up Glide jobs...")
            sys.stdout.flush()
            self.setupJobs()
            self.status = "RUNNING JOBS"
            # key: jobname; value: output file (Will be None if the job failed
            # to produce valid poses).
            self.docked_ligands = {}
            self.dump()

        if self.status == "RUNNING JOBS":
            # Update JobDJ to correct options (may change if restarting):
            self.setJobDJOptions(self.jobdj)

            # Ev:64286 if restarting, do not restart jobs that have output files:
            # FIXME Consider subjobs that completed without poses
            if self.jobdj.hasStarted():
                for i, job in enumerate(self.jobdj.all_jobs):
                    glide_jobname = self.glide_jobnames[i]
                    self.debug('Looking for output of job: %s' %
                               (glide_jobname))
                    try:
                        outfile = self.check_subjob_output(glide_jobname)
                    except RuntimeError as err:
                        # Log file could not be read
                        self.error(str(err))
                        self.info(' Marking for restart')
                        self.jobdj.markForRestart(job, "no-output")
                    else:
                        # Output file found (or job produced no poses)
                        self.info('Output is okay; marking as completed: %s' %
                                  (glide_jobname))
                        if outfile is None:
                            # Subjob produced no results
                            self.info(' Job produced no poses: %s' % outfile)
                        else:
                            self.info(' Output file: %s' % outfile)
                        job._markFinished('found-output')

            self.info("Running Glide jobs...")
            sys.stdout.flush()
            self.runJobDJ(self.jobdj)

            # ALL JOBS ARE COMPLETE maybe some have failed but not more than
            # max_failures.
            self.buildDockedLigandsMap()

        if self.status == "PROCESSING FILES":
            docked_ligand_files = []
            for filename in self.docked_ligands.values():
                if filename is not None:
                    # If at least one ligand from this subjob docked:
                    docked_ligand_files.append(filename)
            self.checkFiles(docked_ligand_files)

            self.processJobOutputs()
            self.status = "CLEANING UP"
            self.dump()

        if self.status == "CLEANING UP":
            if not self.getCleanupRequested():
                self.status = "COMPLETE"
                return
            self.info("Cleaning up...")

            archive_files = []
            for jobname in self.glide_jobnames:
                # Remove these files:
                for ext in ['_raw.mae', '_raw.mae.gz', '_raw.maegz']:
                    filename = jobname + ext
                    if os.path.isfile(filename):
                        fileutils.force_remove(filename)

                # Remove these files only if the user did not ask for them to
                # be returned (or not generating codes):
                if not self.outputRequested(2) or not self.gencodes:
                    for ext in ['_in.mae', '_in.maegz']:
                        filename = jobname + ext
                        if os.path.isfile(filename):
                            fileutils.force_remove(filename)

                # Archive these files:
                for ext in ['.inp', '.log', '.rept', '.out', '.xpdes']:
                    filename = jobname + ext
                    if os.path.isfile(filename):
                        archive_files.append(filename)

                # Compress these files individually:
                # NOTE: Do NOT compress *maegz files as they are already
                # compressed.
                for ext in ['_pv.mae', '_lib.mae']:
                    filename = jobname + ext
                    if os.path.isfile(filename):
                        self.info(' Compressing %s...' % filename)
                        compress(filename)

            if archive_files:
                # There are files to archive:
                # Create an archive for log files:
                tar_file = self.genFileName(extension="_logs.tar.gz")
                log_tar = tarfile.open(tar_file, 'w:gz')
                for filename in archive_files:
                    self.info(' Archiving %s...' % filename)
                    log_tar.add(filename)
                # Close the archive file:
                log_tar.close()

                # Remove the files that were added to the archive:
                for filename in archive_files:
                    fileutils.force_remove(filename)

                # Ev:94264 Add the file to the stage's job record:
                self.addOutputFile(tar_file)

            self.status = "COMPLETE"
            self.dump()

        if self.status == "COMPLETE":
            return

        raise RuntimeError("End of JRStage.operate and self.status = " +
                           self.status)
class GridgenStage(stage.Stage):
    def __init__(self, *args, **kwargs):
        specs = """
        RECEP_VSCALE = float(default=None)
        RECEP_CCUT = float(default=None)
        XCENT = float(default=None)
        YCENT = float(default=None)
        ZCENT = float(default=None)
        GRID_CENTER = float_list(default=None)
        INNERBOX = integer_list(default=None)
        OUTERBOX = float_list(default=None)
        WRITEZIP = boolean(default=False) # Write out compressed grids
        PEPTIDE = boolean(default=None) # Create grid optimized for polypeptides
        HBOND_CONSTRAINTS = string_list(default=None) # H-bond constraints
        METAL_CONSTRAINTS = string_list(default=None) # Metal constraints
        POSIT_CONSTRAINTS = string_list(default=None) # Positional constraints
        NOE_CONSTRAINTS = string_list(default=None) # NOE Positional constraints
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.setMainProduct('glide')

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "grid", True)
        self.addExpectedOutput(2, "text", False)

        self.jobdj = None
        self.model_file = ""
        self.jobname = None
        self.grid_files = []
        self.status = "NOT STARTED"
    def operate(self):
        """
        Perform an operation on the input files.
        """
        enter_status = self.status

        if self.status == "NOT STARTED" or \
                self.status == "SETTING UP JOBS":
            self.status = "SETTING UP JOBS"
            self.info("Setting up the Gridgen job")
            model_file = self.getInput(1).getFiles()[0]
            if not model_file:
                raise RuntimeError(
                    "Please specify a model file to generate grids from.")
            self.model_file = model_file
            self.setupJobs()
            self.status = "RUNNING JOBS"
            self.grid_file = None
            self.dump()

        if self.status == "RUNNING JOBS":
            # Update JobDJ to correct options (may change if restarting):
            self.setJobDJOptions(self.jobdj)
            if enter_status == "RUNNING JOBS":
                self.info("Trying to restart jobs...")
            sys.stdout.flush()
            self.runJobDJ(self.jobdj)

            if self['WRITEZIP']:
                self.grid_file = self.jobname + '.zip'
                self.checkFile(self.grid_file)
            else:
                self.grid_file = self.jobname + '.grd'
                grid_files = [
                    self.jobname + "_coul2.fld", self.jobname + ".csc",
                    self.jobname + ".grd", self.jobname + "_greedy.save",
                    self.jobname + ".gsc", self.jobname + ".save",
                    self.jobname + ".site", self.jobname + "_vdw.fld"
                ]
                self.checkFiles(grid_files)

            self.info("processing job outputs...")
            sys.stdout.flush()
            time.sleep(1)

            grid = pipeio.Grid(self.grid_file)
            try:
                grid.check()
            except:
                raise RuntimeError(
                    "Grid generation job did not complete successfully!")

            self.info(" Output grid file: %s" % self.grid_file)
            sys.stdout.flush()
            self.setOutput(1, grid)

            self.info(" processing complete.")
            sys.stdout.flush()
            self.status = "COMPLETED"
            self.dump()
        return
    def setupJobs(self):
        outname = self.getOutputName(1)
        self.jobname = outname

        # FIXME move to using the new GRID_CENTER keyword
        keynames = [
            'WRITEZIP', 'RECEP_VSCALE', 'RECEP_CCUT', 'XCENT', 'YCENT',
            'ZCENT', 'GRID_CENTER', 'INNERBOX', 'OUTERBOX', 'PEPTIDE'
        ]

        keywords = {}
        keywords['JOBNAME'] = self.jobname
        keywords['RECEP_FILE'] = self.model_file

        # Add constraints keywords:
        keynames += [
            'HBOND_CONSTRAINTS', 'METAL_CONSTRAINTS', 'POSIT_CONSTRAINTS',
            'NOE_CONSTRAINTS'
        ]

        # Setup keywords:
        for key in keynames:
            value = self[key]
            if value is not None:
                keywords[key] = value

        # Write the input file:
        input_file = glide.Gridgen(keywords).writeSimplified()
        self.checkFile(input_file)
        sys.stdout.flush()

        cmd = [glide_path, input_file]

        self.info(
            "Starting glide grid generation...(estimated cpu time 10 minutes)")
        sys.stdout.flush()

        self.jobdj = self.getJobDJ()  # FIXME
        self.jobdj.setOptions(verbosity="normal")
        self.jobdj.addJob(cmd)
class MergeStage(stage.Stage):
    """
    Stage for merging up to 999 Glide output files with
    'glide_ensemble_merge'. The input files can be either PV or LIB, and have
    to be sorted by GlideScore. The output file will also be sorted, and the
    GlideScores can be offset by a specified value for each input file.

    The keywords specific to this stage are...

    NREPORT
        Save only the top NREPORT poses. The default is 0, which retains all
        poses.

    MAXPERLIG
        Retain at most MAXPERLIG poses for each ligand (determined by the
        structure title). The default is 1. Use 0 to retain all poses.

    UNIQUEFIELD
        Property to uniquely identify compounds by. Default is title.

    OFFSETS
        List of GlideScore offsets (float) to be applied to the input
        structures. The first item in the list is the offset for the first
        input, the second is for the second input, etc. If not specified, all
        are defaulted to 0.0.

    MARKERS
        List of string labels for each receptor. Each ligand will be marked by
        this value depending on which input set it originated from.

    The stage takes up to 999 input structure file sets (PV or LIB) and
    generates a single output pose file.
    """
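    # Illustrative keyword block for this stage (values are hypothetical),
    # showing how OFFSETS and MARKERS pair up with the input sets by position:
    # the first offset/marker applies to the first input set, the second to
    # the second, and so on:
    #
    #     NREPORT     5000
    #     MAXPERLIG   1
    #     UNIQUEFIELD s_m_title
    #     OFFSETS     0.0, 0.5
    #     MARKERS     receptorA, receptorB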
    def __init__(self, *args, **kwargs):
        """
        See class docstring.
        """
        specs = """
        NREPORT = integer(default=0) # Default is unlimited (0)
        MAXPERLIG = integer(default=1)
        UNIQUEFIELD = string(default="s_m_title")
        OFFSETS = float_list(default=None)
        MARKERS = string_list(default=None)
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        for i in range(1, 1000):
            self.addExpectedInput(i, 'structures', i < 2)
        self.addExpectedOutput(1, "structures", True)

        self._job_outputs = []
    def operate(self):
        """
        Run the 'glide_ensemble_merge' script on all the input files from all
        sets, optionally applying GlideScore offsets to some sets. Generates a
        single output pose file.
        """
        # Fill offset_dict with all input files and offsets:
        offset_dict = {}
        marker_dict = {}
        offsets = self['OFFSETS']
        markers = self['MARKERS']
        uniquefield = self['UNIQUEFIELD']

        inputs = {}  # key: INPUT number, value: list of ligand files
        i = 0
        while True:
            i += 1
            inputobj = self.getInput(i)
            if not inputobj:
                break  # No more inputs
            ligfiles = inputobj.getFiles()
            inputs[i] = ligfiles

        if offsets:
            if len(offsets) != len(inputs):
                msg = "Number of OFFSETS (%i) != number of INPUTs (%i)" % \
                    (len(offsets), len(inputs))
                self.error(msg)
                raise RuntimeError(msg)
        else:
            offsets = [0.0 for inp in inputs]

        if markers:
            if len(markers) != len(inputs):
                msg = "Number of MARKERS (%i) != number of INPUTs (%i)" % \
                    (len(markers), len(inputs))
                self.error(msg)
                raise RuntimeError(msg)
        else:
            markers = [str(inp) for inp in inputs]

        for i, ligfiles in inputs.items():
            offset = offsets[i - 1]  # 0-indexed
            marker = markers[i - 1]  # 0-indexed
            self.debug("LIGANDS #%i: %s" % (i, ligfiles))
            self.debug("OFFSET #%i: %f" % (i, offset))
            self.debug("MARKER #%i: %s" % (i, marker))
            foundligs = False
            for ligfile in ligfiles:
                foundligs = True
                offset_dict[ligfile] = offset
                marker_dict[ligfile] = marker
            if not foundligs:
                self.warning("WARNING: No structure files for input pin %i" %
                             i)

        if not offset_dict:
            msg = "ERROR: No structure files were found to merge"
            raise RuntimeError(msg)

        # Run glide_ensemble_merge on all the input files:
        gem_jobname = self.stagename
        args = ['-j', gem_jobname]

        nr = self['NREPORT']  # If not specified, use default of 'unlimited'
        if nr:
            args.extend(['-n', str(nr)])

        mpl = self['MAXPERLIG']  # If not specified, use default of 1.
        if mpl:
            args.extend(['-m', str(mpl)])

        for ligfile, offset in offset_dict.items():
            marker = marker_dict.get(ligfile)
            a = '%s:%f:%s' % (ligfile, offset, marker)
            args.append(a)

        args += ["-unique_field", uniquefield]

        schro = os.environ['SCHRODINGER']
        script = os.path.join(schro, 'utilities', 'glide_ensemble_merge')
        cmd = [script] + args

        # Run with the -epv option, which adds special markers pairing
        # receptors and poses and merges them into a single _epv.maegz file
        # See VSW-860
        cmd.append("-epv")

        self.info("Running glide_ensemble_merge command:")
        self.info("%s" % cmd)
        self.info("...")

        gemlog = 'glide_ensemble_merge.log'
        verbosity = self.getVerbosity()
        if verbosity == 'quiet':
            fh = open(gemlog, 'w')
            subprocess.call(cmd, stdout=fh)
            fh.close()
        else:
            subprocess.call(cmd)
        self.info(" complete.")

        # Rename glide_ensemble_merge outputs:
        merged_file = gem_jobname + '_epv.maegz'
        msg = "glide_ensemble_merge did not produce an output:"
        self.checkFile(merged_file, msg)

        renamed_outfile = self.genOutputFileName(1, extension="_epv.maegz")
        shutil.move(merged_file, renamed_outfile)
        self.checkFile(renamed_outfile, "Failed to move file to:")
        self.debug("Merged poses file: " + renamed_outfile)

        if os.path.isfile(gemlog):
            fileutils.force_remove(gemlog)

        self.setOutput(1, pipeio.Structures([renamed_outfile]))
class GlideSortStage(stage.Stage):
    """
    Stage for sorting Glide _pv/lib.mae files. Currently, a sorted file is
    produced for each input file; that is, the input files are not merged and
    sorted together.

    The keywords specific to this stage are...

    BLOCK_SORT_BY_TITLE
        Whether to block-sort the poses by title. If True (the default), all
        poses with the same title are grouped and sorted by the
        INTRATITLE_SORT_KEYS keys. The blocks themselves are sorted by
        applying the SORT_KEYS to just the top pose in each block.

    INTRATITLE_SORT_KEYS
        The sort key(s) within a title block. The default is
        'r_i_glide_emodel' (appropriate for Glide HTVS/SP results). Multiple
        sort keys must be whitespace-separated.

    SORT_KEYS
        The key(s) for sorting the pose file. If BLOCK_SORT_BY_TITLE is True,
        the blocks are sorted by the values of their top pose's SORT_KEYS
        properties. Multiple sort keys must be whitespace-separated.

    OUTPUT_POSES_PER_TITLE
        The number of top poses per title to keep. Applies only if
        BLOCK_SORT_BY_TITLE is true. The default is 0, which means retain all
        poses.

    INPUT_TYPE
        Specify whether the receptor is ('pv') or is not ('lib') in the file.
        By default, the type will be determined from the file name (i.e.,
        _pv/lib.mae).
    """
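    # Illustrative keyword block for this stage (values are hypothetical):
    # block-sort by title, rank the poses within each title block and the
    # blocks themselves by docking score rather than the default Emodel, and
    # keep only the two best poses per title:
    #
    #     BLOCK_SORT_BY_TITLE     True
    #     INTRATITLE_SORT_KEYS    r_i_docking_score
    #     SORT_KEYS               r_i_docking_score
    #     OUTPUT_POSES_PER_TITLE  2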
    def __init__(self, *args, **kwargs):
        """
        See class docstring.
        """
        specs = """
        BLOCK_SORT_BY_TITLE = boolean(default=True)
        INTRATITLE_SORT_KEYS = string(default="r_i_glide_emodel")
        SORT_KEYS = string(default="r_i_docking_score")
        OUTPUT_POSES_PER_TITLE = integer(default=0)
        INPUT_TYPE = string(default=None)
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "structures", True)
    def operate(self):
        """
        Sort each input pose file using the application.glide.glideanalysis
        module. Generates a sorted output pose file for each input pose file.
        """
        ligfiles = self.getInput(1).getFiles()
        outfiles = []
        for index, ligfile in enumerate(ligfiles):
            outfile = self.genOutputFileName(1,
                                             filenum=index + 1,
                                             extension=".mae")
            outfiles.append(outfile)
            # NOTE: the PoseLibrary class is not yet maegz-aware
            pl = glideanalysis.PoseLibrary(
                file_name=ligfile,
                file_type=self['INPUT_TYPE'],
                sort_property_1=self['SORT_KEYS'],
                sort_property_2=self['INTRATITLE_SORT_KEYS'],
                block_sort=self['BLOCK_SORT_BY_TITLE'])
            if (self['OUTPUT_POSES_PER_TITLE'] < 1 or
                    not self['BLOCK_SORT_BY_TITLE']):
                # Write all poses
                indices = []
                for item in pl.title_tuples:
                    indices.extend(item[:])
                pl.write(outfile, indexes=indices)
            else:
                pl.writeTopPoses(
                    output_lib_name=outfile,
                    max_pose_per_title=self['OUTPUT_POSES_PER_TITLE'])

        self.setOutput(1, pipeio.Structures(outfiles))
# EOF