Source code for schrodinger.pipeline.stages.gencodes

"""
Stage for generating and storing codes for ligands/compounds and variants.

It also recombined input ligands into a number of subjobs specified via -NJOBS
argument (If -adjust is used, this number is also adjusted based on the
MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords.

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Matvey Adzhigirey

import os
import sys

from schrodinger import structure
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage
from schrodinger.utils.fileutils import SMILES
from schrodinger.utils.fileutils import SMILESCSV
from schrodinger.utils.fileutils import get_structure_file_format
from schrodinger.utils.fileutils import is_poseviewer_file


[docs]class RecombineStage(stage.Stage): """ Stage for generating compound and variant codes, which are stored as properties. It also recombined input ligands into a number of subjobs specified via -NJOBS argument (If -adjust is used, this number is also adjusted based on the MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords. See the specs for the keywords specific to this stage. A ligand/compound is identified by the UNIQUEFIELD property, and structures with the same value of this property are indexed as variants. If UNIQUEFIELD is not specified, each structure is treated as a compound (i.e., there are no variants), and the label for the compound is just the structure index number as all the files and structures are read. The output structures will have their OUTCOMPOUNDFIELD set to their compound label (either the UNIQUEFIELD property value or the structure index). If OUTCOMPOUNDFIELD is "s_m_title", the original titles will be saved to the 's_pipeline_title_backup' property. OUTVARIANTFIELD property ('s_pipeline_variant' by default) is set to "<compound_label>-<variant_index>". The stage takes one input structure file set and generates one set of output structure files, each containing about 100,000 structures. """
[docs] def __init__(self, *args, **kwargs): """ See class docstring. """ specs = """ GENCODES = boolean(default=True) # Whether to generate compound and variant codes. NUMOUT = option("preserve", "njobs", "fewest", default="preserve") # How to generate the number of output files. "preserve" - same as the input; "fewest" - as few output files as possible; "njobs" - derive from -NJOBS option. PRESERVE_NJOBS = boolean(default=False) # If not recombining, UNIQUEFIELD = string(default="NONE") # Property that is unique to each compound. If "NONE", then each structure is unique. OUTCOMPOUNDFIELD = string(default="s_pipeline_compound_code") # Field to put the compound codes into (if GENCODES is True). OUTVARIANTFIELD = string(default="s_pipeline_variant") # Field to put variant codes into (if GENCODES is True) OUTFORMAT = option("maegz", "mae", "sdf", default="maegz") # Output format SKIP_BAD_LIGANDS = boolean(default=True) SKIP_RECEPTOR = boolean(default=True) # Whether to remove receptors from the set (PV files only) MIXLIGS = boolean(default=False) # Whether to re-shuffle the ligands between subjobs. SKIP_NOUNIQUE_LIGANDS = boolean(default=False) # Whether to skip ligands that have no unique field. MIN_SUBJOB_STS = integer(default=None) # Minimum subjob size MAX_SUBJOB_STS = integer(default=None) # Maximum subjob size """ stage.Stage.__init__(self, specs=specs, *args, **kwargs) self.addExpectedInput(1, "structures", True) self.addExpectedOutput(1, "structures", True)
[docs] def operate(self): """ Generates ligand compound and variant codes, and saves the structures into fewer number of files (max 100,000 per file, except a new output file must start with a new compound). Adds the compound label, compound-and-variant, and original title to the output structures as properties. Raises a RuntimeError if the output file format is invalid, if there is a problem reading an input file or writing an output file, or if the UNQIUEFIELD property doesn't exist for a structure. """ """ Read all input structures, while optionally skipping bad structures, and save each structure into a subjob input file. Also OUTCOMPOUNDFIELD is set to the property specified by UNIQUEFIELD. If UNIQUEFIELD is set to NONE, then OUTCOMPOUNDFIELD is set to a number which is unique for each input ligand. If UNIQUEFIELD is NONE, then OUTCOMPOUNDFIELD is determined either from UNIQUEFILED property, or each structure is assumed to be its own compound. If OUTCOMPOUNDFIELD is "s_m_title", then the original title is backed up into the "s_pipeline_title_backup" property. Structures can also be shuffled within subjobs in order to make subjob lengths more uniform if MIXLIGS is set to True. """ self.input_files = self.getInput(1).getFiles() outformat = self['OUTFORMAT'] if self['NUMOUT'] == 'njobs': self.info( "The number of output files will be derived from -NJOBS value") numout_njobs = True preserve_njobs = False min_job_size = self['MIN_SUBJOB_STS'] max_job_size = self['MAX_SUBJOB_STS'] elif self['NUMOUT'] == 'preserve': self.info("Will preserve the number of files") numout_njobs = False preserve_njobs = True else: self.info("Will create as few output files as possible") numout_njobs = False preserve_njobs = False if self['MIXLIGS']: self.warning('WARNING: Keyword MIXLIGS is no longer supported') gencodes = self['GENCODES'] # These keywords are ignored if not generating codes: uniquefield = self['UNIQUEFIELD'] variantfield = self['OUTVARIANTFIELD'] compoundfield = self['OUTCOMPOUNDFIELD'] # Make sure the keywords are specified properly: if outformat not in ["mae", "maegz", "sdf"]: raise RuntimeError('Output Format needs to be "mae" or "sdf"!') if not gencodes: # Don't generate codes for keyword in [ 'UNIQUEFIELD', 'OUTVARIANTFIELD', 'OUTCOMPOUNDFIELD' ]: if self[keyword]: self.warning("GENCODES is False; ignoring %s keyword" % keyword) if numout_njobs: # If NUMOUT was set to "njobs", we need to count the number of # input structures: if hasattr(self, 'input_st_num'): # already counted # Already calculated self.info( "Number of input structures was already calculated: %i" % self.input_st_num) else: # FIXME If input number of structures is know, use it self.info("Counting number of input structures") input_st_num = 0 for filename in self.input_files: try: input_st_num += structure.count_structures(filename) except: self.exit("ERROR Could not get st count for file: " + filename) self.input_st_num = input_st_num self.dump() # Save the instance self.info("Number of input structures: %i" % self.input_st_num) # Determine the number of subjobs to split the ligand set into: (n_st_per_file, njobs, adjusted) = self.getAdjustedNJobs(self.input_st_num, min_job_size, max_job_size) if adjusted: self.info( "Adjusting number of jobs to yield %i-%i ligands per job" % (min_job_size, max_job_size)) self.info("Will generate %i subjobs" % njobs) elif preserve_njobs: njobs = len(self.input_files) self.info("Separating ligands into %i subjobs." % njobs) self.outfile_compounds = {} # Number of compounds in each input job self.outfile_structures = {} # Number of structures in each input job recombined_writers_dict = {} # keys are filenums # The compound code of the previous structure: current_compound_id = None current_compound_id_variant = 0 number_of_compounds = 0 in_st_num = 0 # Processing ligand # output_structure_count = 0 max_file_size = 100000 # Just less than 2GB if gencodes: self.info(" Will generate ligand compound and variant codes...") if uniquefield == "NONE": self.info( " Each ligand will be treated as a unique compound.") else: self.info(" Unique field: " + uniquefield) sys.stdout.flush() self.info("\nProcessing ligand files...") output_files = [] curr_writer = None prev_filenum = None filenum = 0 if numout_njobs: dp = pipeutils.DotPrinter(self.input_st_num) else: dp = pipeutils.DotPrinter() num_skipped_sts = 0 in_st_num = 0 infilenum = 0 for infile in self.input_files: infilenum += 1 iformat = get_structure_file_format(infile) self.debug("\nReading file: %s (format %s)" % (infile, iformat)) # Will open unknown files as SD: if iformat == SMILES: sts = structure.SmilesReader(infile) elif iformat == SMILESCSV: sts = structure.SmilesCsvReader(infile) else: sts = pipeutils.get_reader(infile) if iformat in [SMILES, SMILESCSV]: if compoundfield != 's_m_title': msg = "ERROR: Could not save compound codes to property '%s' because the input structure is in SMILES format" % compoundfield self.exit(msg) st_file_pos = 0 # Location of this structure in the input file while True: # Iterate until StopIteration: # For each structure in this file: try: st = next(sts) except StopIteration: break # EOF, exit except Exception: # Bad structure if self['SKIP_BAD_LIGANDS']: num_skipped_sts += 1 in_st_num += 1 st_file_pos += 1 continue # Skip the bad structure else: msg = "ERROR: Could not read next structure from file: %s" % infile msg += "Try setting SKIP_BAD_LIGANDS to True" self.exit(msg) else: # Read OK in_st_num += 1 # If got here, then read the structure okay st_file_pos += 1 is_receptor = False # Skip first structure if PV file is specified: if st_file_pos == 1 and is_poseviewer_file(infile): if self['SKIP_RECEPTOR']: continue # Skip the first structure (receptor) else: is_receptor = True # Print progress period: dp.dot() if not is_receptor: # Set the compound code: if uniquefield == 'NONE': compound_id = str(in_st_num) else: try: # Ev:96648 & Ev: 104439 compound_id = pipeutils.read_unique_field( st, uniquefield) except KeyError: msg = "ERROR: File %s; ligand %i is missing property: %s" % ( os.path.basename(infile), st_file_pos, uniquefield) if self['SKIP_NOUNIQUE_LIGANDS']: self.warning(msg) continue # Go to next input ligand else: self.exit(msg) # Figure out if it's a new compound or not, and if it's a # new variant or not: if compound_id != current_compound_id or current_compound_id is None: # New compound: current_compound_id_variant = 1 number_of_compounds += 1 current_compound_id = compound_id else: # Another variant of the same compound: current_compound_id_variant += 1 if gencodes: if self['OUTCOMPOUNDFIELD'] == 's_m_title': if iformat not in [SMILES, SMILESCSV]: st.property[ "s_pipeline_title_backup"] = st.title # Will work for SmilesStructures also st.title = compound_id else: # FIXME will not work for SmilesStructures: st.property[self['OUTCOMPOUNDFIELD']] = compound_id # Set the variant code: s = "%s-%i" % (compound_id, current_compound_id_variant) st.property[variantfield] = s # Figure out which file to write to: if is_receptor: if filenum == 0: filenum = 1 else: pass # write to the same file # If the compound_id (compound code) for this ligand is the same as # for the previous ligand, write them to the same subjob file. elif current_compound_id_variant == 1: # This compound is different from a previous compound. # (will also happen for the first structure) # Consider writing it to a different file. if filenum == 0: filenum = 1 elif preserve_njobs: # "preserve" # Requested to keep number of subjobs the same: # Put this structure into the same file as it came # from: filenum = infilenum elif numout_njobs: # "njobs" if self.outfile_structures[filenum] >= n_st_per_file: filenum += 1 else: # "fewest" # Requested to keep number of output files to a # minimum: if filenum == 0: filenum = 1 elif self.outfile_structures[filenum] >= max_file_size: filenum += 1 # Record number of compounds that is in this output file: try: self.outfile_compounds[filenum] += 1 except KeyError: self.outfile_compounds[filenum] = 1 if filenum != prev_filenum: prev_filenum = filenum if iformat == SMILES: outfile = self.genFileName(filenum=filenum, extension="-in.smi") elif iformat == SMILESCSV: outfile = self.genFileName(filenum=filenum, extension="-in.csv") else: outfile = self.genFileName(filenum=filenum, extension="-in.maegz") if curr_writer: # Close the previous file handle VSW-891 curr_writer.close() curr_writer = structure.StructureWriter(outfile) # Sanity check: assert outfile not in output_files output_files.append(outfile) self.outfile_structures[filenum] = 0 # Write the output structure to file: curr_writer.append(st) self.outfile_structures[filenum] += 1 output_structure_count += 1 if in_st_num < 1: self.exit( "\nInput file did not contain any readable structures: " + infile) if output_structure_count > 0: # Close the last file handle: curr_writer.close() output_compound_count = sum(self.outfile_compounds.values()) # Print summary: self.info('') # To have a new line after the progress periods self.info("Number of input structures: %i" % (in_st_num)) self.info("Number of processed structures: %i" % output_structure_count) self.info("Number of processed compounds: %i" % output_compound_count) if num_skipped_sts: self.warning( "\n%i structures were skipped because they were invalid." % num_skipped_sts) self.info("Number of output files: %i" % len(output_files)) if True: # Report to the log file the numbers of structure per each subjob: num_list = [] for i, numposes in self.outfile_structures.items(): num_compounds = self.outfile_compounds[i] # NUM:(ROOTS, STRUCTUES) num_list.append("%i:(%i, %i)" % (i, num_compounds, numposes)) msg = "\nNumber of (compounds, structures) for each output file:" msg += "\n%s" % ", ".join(num_list) self.info(msg) self.setOutput(1, pipeio.Structures(output_files, output_structure_count))
# Ev:78695
[docs]class RestoreTitlesStage(stage.Stage): """ Stage for copying titles for each input ligand from the 's_pipeline_title_backup' property to the 's_m_title' property. The stage takes one input structure file set and generates one set of output structure files, each containing about 100,000 structures. """
[docs] def __init__(self, *args, **kwargs): """ See class docstring. """ specs = """ SOURCE_FIELD = string(default="s_pipeline_title_backup") DESTINATION_FIELD = string(default="s_m_title") """ stage.Stage.__init__(self, specs=specs, *args, **kwargs) self.addExpectedInput(1, "structures", True) self.addExpectedOutput(1, "structures", True)
[docs] def operate(self): """ For each structure, sets the value of the DESTINATION_FIELD to the the value of the SOURCE_FIELD. If no such value exists, raises a RuntimeError. """ self.input_files = self.getInput(1).getFiles() self.source_field = self['SOURCE_FIELD'] self.destination_field = self['DESTINATION_FIELD'] self.info('Will copy values from property "%s" to property "%s"' % (self.source_field, self.destination_field)) self.info("\nProcessing ligand files...") dp = pipeutils.DotPrinter() writer = structure.MultiFileStructureWriter(self.genOutputFileName(1), extension=".maegz") st_num = 0 for ligfile in self.input_files: for st in pipeutils.get_reader(ligfile): st_num += 1 # Print progress period: dp.dot() try: value = st.property[self.source_field] except KeyError: raise RuntimeError( 'Error reading property "%s" for structure %i of file "%s"' + (self.source_field, st_num, ligfile)) try: st.property[self.destination_field] = value except: raise RuntimeError( 'Error setting property "%s" for structure %i of file "%s"' + (self.destination_field, st_num, ligfile)) writer.append(st) writer.close() output_files = writer.getFiles() self.setOutput(1, pipeio.Structures(output_files, st_num)) # Print summary: self.info('') # To have a new line after the progress periods self.info("Number of processed structures: %i" % (st_num)) self.info("Number of output files: %i" % len(output_files))
# EOF