"""
Stage for generating and storing codes for ligands/compounds and variants.
It also recombined input ligands into a number of subjobs specified via -NJOBS
argument (If -adjust is used, this number is also adjusted based on the
MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords.
Copyright Schrodinger, LLC. All rights reserved.
"""
# Contributors: Matvey Adzhigirey
import os
import sys
from schrodinger import structure
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage
from schrodinger.utils.fileutils import SMILES
from schrodinger.utils.fileutils import SMILESCSV
from schrodinger.utils.fileutils import get_structure_file_format
from schrodinger.utils.fileutils import is_poseviewer_file
[docs]class RecombineStage(stage.Stage):
"""
Stage for generating compound and variant codes, which are stored as
properties.
It also recombined input ligands into a number of subjobs specified via
-NJOBS argument (If -adjust is used, this number is also adjusted based
on the MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords.
See the specs for the keywords specific to this stage.
A ligand/compound is identified by the UNIQUEFIELD property, and
structures with the same value of this property are indexed as
variants. If UNIQUEFIELD is not specified, each structure is treated as
a compound (i.e., there are no variants), and the label for the compound
is just the structure index number as all the files and structures are
read.
The output structures will have their OUTCOMPOUNDFIELD set to their compound
label (either the UNIQUEFIELD property value or the structure index). If
OUTCOMPOUNDFIELD is "s_m_title", the original titles will be saved to the
's_pipeline_title_backup' property.
OUTVARIANTFIELD property ('s_pipeline_variant' by default) is set to
"<compound_label>-<variant_index>".
The stage takes one input structure file set and generates one set of
output structure files, each containing about 100,000 structures.
"""
[docs] def __init__(self, *args, **kwargs):
"""
See class docstring.
"""
specs = """
GENCODES = boolean(default=True) # Whether to generate compound and variant codes.
NUMOUT = option("preserve", "njobs", "fewest", default="preserve") # How to generate the number of output files. "preserve" - same as the input; "fewest" - as few output files as possible; "njobs" - derive from -NJOBS option.
PRESERVE_NJOBS = boolean(default=False) # If not recombining,
UNIQUEFIELD = string(default="NONE") # Property that is unique to each compound. If "NONE", then each structure is unique.
OUTCOMPOUNDFIELD = string(default="s_pipeline_compound_code") # Field to put the compound codes into (if GENCODES is True).
OUTVARIANTFIELD = string(default="s_pipeline_variant") # Field to put variant codes into (if GENCODES is True)
OUTFORMAT = option("maegz", "mae", "sdf", default="maegz") # Output format
SKIP_BAD_LIGANDS = boolean(default=True)
SKIP_RECEPTOR = boolean(default=True) # Whether to remove receptors from the set (PV files only)
MIXLIGS = boolean(default=False) # Whether to re-shuffle the ligands between subjobs.
SKIP_NOUNIQUE_LIGANDS = boolean(default=False) # Whether to skip ligands that have no unique field.
MIN_SUBJOB_STS = integer(default=None) # Minimum subjob size
MAX_SUBJOB_STS = integer(default=None) # Maximum subjob size
"""
stage.Stage.__init__(self, specs=specs, *args, **kwargs)
self.addExpectedInput(1, "structures", True)
self.addExpectedOutput(1, "structures", True)
[docs] def operate(self):
"""
Generates ligand compound and variant codes, and saves the structures into
fewer number of files (max 100,000 per file, except a new output file
must start with a new compound). Adds the compound label,
compound-and-variant, and original title to the output structures as
properties. Raises a RuntimeError if the output file format is
invalid, if there is a problem reading an input file or writing an
output file, or if the UNQIUEFIELD property doesn't exist for a
structure.
"""
"""
Read all input structures, while optionally skipping bad structures,
and save each structure into a subjob input file.
Also OUTCOMPOUNDFIELD is set to the property specified by UNIQUEFIELD.
If UNIQUEFIELD is set to NONE, then OUTCOMPOUNDFIELD is set to a number
which is unique for each input ligand.
If UNIQUEFIELD is NONE, then OUTCOMPOUNDFIELD
is determined either from UNIQUEFILED property, or each structure
is assumed to be its own compound.
If OUTCOMPOUNDFIELD is "s_m_title", then the original title is backed
up into the "s_pipeline_title_backup" property.
Structures can also be shuffled within subjobs in order to make
subjob lengths more uniform if MIXLIGS is set to True.
"""
self.input_files = self.getInput(1).getFiles()
outformat = self['OUTFORMAT']
if self['NUMOUT'] == 'njobs':
self.info(
"The number of output files will be derived from -NJOBS value")
numout_njobs = True
preserve_njobs = False
min_job_size = self['MIN_SUBJOB_STS']
max_job_size = self['MAX_SUBJOB_STS']
elif self['NUMOUT'] == 'preserve':
self.info("Will preserve the number of files")
numout_njobs = False
preserve_njobs = True
else:
self.info("Will create as few output files as possible")
numout_njobs = False
preserve_njobs = False
if self['MIXLIGS']:
self.warning('WARNING: Keyword MIXLIGS is no longer supported')
gencodes = self['GENCODES']
# These keywords are ignored if not generating codes:
uniquefield = self['UNIQUEFIELD']
variantfield = self['OUTVARIANTFIELD']
compoundfield = self['OUTCOMPOUNDFIELD']
# Make sure the keywords are specified properly:
if outformat not in ["mae", "maegz", "sdf"]:
raise RuntimeError('Output Format needs to be "mae" or "sdf"!')
if not gencodes:
# Don't generate codes
for keyword in [
'UNIQUEFIELD', 'OUTVARIANTFIELD', 'OUTCOMPOUNDFIELD'
]:
if self[keyword]:
self.warning("GENCODES is False; ignoring %s keyword" %
keyword)
if numout_njobs:
# If NUMOUT was set to "njobs", we need to count the number of
# input structures:
if hasattr(self, 'input_st_num'): # already counted
# Already calculated
self.info(
"Number of input structures was already calculated: %i" %
self.input_st_num)
else:
# FIXME If input number of structures is know, use it
self.info("Counting number of input structures")
input_st_num = 0
for filename in self.input_files:
try:
input_st_num += structure.count_structures(filename)
except:
self.exit("ERROR Could not get st count for file: " +
filename)
self.input_st_num = input_st_num
self.dump() # Save the instance
self.info("Number of input structures: %i" % self.input_st_num)
# Determine the number of subjobs to split the ligand set into:
(n_st_per_file, njobs,
adjusted) = self.getAdjustedNJobs(self.input_st_num, min_job_size,
max_job_size)
if adjusted:
self.info(
"Adjusting number of jobs to yield %i-%i ligands per job" %
(min_job_size, max_job_size))
self.info("Will generate %i subjobs" % njobs)
elif preserve_njobs:
njobs = len(self.input_files)
self.info("Separating ligands into %i subjobs." % njobs)
self.outfile_compounds = {} # Number of compounds in each input job
self.outfile_structures = {} # Number of structures in each input job
recombined_writers_dict = {} # keys are filenums
# The compound code of the previous structure:
current_compound_id = None
current_compound_id_variant = 0
number_of_compounds = 0
in_st_num = 0 # Processing ligand #
output_structure_count = 0
max_file_size = 100000 # Just less than 2GB
if gencodes:
self.info(" Will generate ligand compound and variant codes...")
if uniquefield == "NONE":
self.info(
" Each ligand will be treated as a unique compound.")
else:
self.info(" Unique field: " + uniquefield)
sys.stdout.flush()
self.info("\nProcessing ligand files...")
output_files = []
curr_writer = None
prev_filenum = None
filenum = 0
if numout_njobs:
dp = pipeutils.DotPrinter(self.input_st_num)
else:
dp = pipeutils.DotPrinter()
num_skipped_sts = 0
in_st_num = 0
infilenum = 0
for infile in self.input_files:
infilenum += 1
iformat = get_structure_file_format(infile)
self.debug("\nReading file: %s (format %s)" % (infile, iformat))
# Will open unknown files as SD:
if iformat == SMILES:
sts = structure.SmilesReader(infile)
elif iformat == SMILESCSV:
sts = structure.SmilesCsvReader(infile)
else:
sts = pipeutils.get_reader(infile)
if iformat in [SMILES, SMILESCSV]:
if compoundfield != 's_m_title':
msg = "ERROR: Could not save compound codes to property '%s' because the input structure is in SMILES format" % compoundfield
self.exit(msg)
st_file_pos = 0 # Location of this structure in the input file
while True: # Iterate until StopIteration:
# For each structure in this file:
try:
st = next(sts)
except StopIteration:
break # EOF, exit
except Exception: # Bad structure
if self['SKIP_BAD_LIGANDS']:
num_skipped_sts += 1
in_st_num += 1
st_file_pos += 1
continue # Skip the bad structure
else:
msg = "ERROR: Could not read next structure from file: %s" % infile
msg += "Try setting SKIP_BAD_LIGANDS to True"
self.exit(msg)
else: # Read OK
in_st_num += 1
# If got here, then read the structure okay
st_file_pos += 1
is_receptor = False
# Skip first structure if PV file is specified:
if st_file_pos == 1 and is_poseviewer_file(infile):
if self['SKIP_RECEPTOR']:
continue # Skip the first structure (receptor)
else:
is_receptor = True
# Print progress period:
dp.dot()
if not is_receptor:
# Set the compound code:
if uniquefield == 'NONE':
compound_id = str(in_st_num)
else:
try:
# Ev:96648 & Ev: 104439
compound_id = pipeutils.read_unique_field(
st, uniquefield)
except KeyError:
msg = "ERROR: File %s; ligand %i is missing property: %s" % (
os.path.basename(infile), st_file_pos,
uniquefield)
if self['SKIP_NOUNIQUE_LIGANDS']:
self.warning(msg)
continue # Go to next input ligand
else:
self.exit(msg)
# Figure out if it's a new compound or not, and if it's a
# new variant or not:
if compound_id != current_compound_id or current_compound_id is None:
# New compound:
current_compound_id_variant = 1
number_of_compounds += 1
current_compound_id = compound_id
else:
# Another variant of the same compound:
current_compound_id_variant += 1
if gencodes:
if self['OUTCOMPOUNDFIELD'] == 's_m_title':
if iformat not in [SMILES, SMILESCSV]:
st.property[
"s_pipeline_title_backup"] = st.title
# Will work for SmilesStructures also
st.title = compound_id
else:
# FIXME will not work for SmilesStructures:
st.property[self['OUTCOMPOUNDFIELD']] = compound_id
# Set the variant code:
s = "%s-%i" % (compound_id, current_compound_id_variant)
st.property[variantfield] = s
# Figure out which file to write to:
if is_receptor:
if filenum == 0:
filenum = 1
else:
pass # write to the same file
# If the compound_id (compound code) for this ligand is the same as
# for the previous ligand, write them to the same subjob file.
elif current_compound_id_variant == 1:
# This compound is different from a previous compound.
# (will also happen for the first structure)
# Consider writing it to a different file.
if filenum == 0:
filenum = 1
elif preserve_njobs:
# "preserve"
# Requested to keep number of subjobs the same:
# Put this structure into the same file as it came
# from:
filenum = infilenum
elif numout_njobs:
# "njobs"
if self.outfile_structures[filenum] >= n_st_per_file:
filenum += 1
else:
# "fewest"
# Requested to keep number of output files to a
# minimum:
if filenum == 0:
filenum = 1
elif self.outfile_structures[filenum] >= max_file_size:
filenum += 1
# Record number of compounds that is in this output file:
try:
self.outfile_compounds[filenum] += 1
except KeyError:
self.outfile_compounds[filenum] = 1
if filenum != prev_filenum:
prev_filenum = filenum
if iformat == SMILES:
outfile = self.genFileName(filenum=filenum,
extension="-in.smi")
elif iformat == SMILESCSV:
outfile = self.genFileName(filenum=filenum,
extension="-in.csv")
else:
outfile = self.genFileName(filenum=filenum,
extension="-in.maegz")
if curr_writer:
# Close the previous file handle VSW-891
curr_writer.close()
curr_writer = structure.StructureWriter(outfile)
# Sanity check:
assert outfile not in output_files
output_files.append(outfile)
self.outfile_structures[filenum] = 0
# Write the output structure to file:
curr_writer.append(st)
self.outfile_structures[filenum] += 1
output_structure_count += 1
if in_st_num < 1:
self.exit(
"\nInput file did not contain any readable structures: " +
infile)
if output_structure_count > 0:
# Close the last file handle:
curr_writer.close()
output_compound_count = sum(self.outfile_compounds.values())
# Print summary:
self.info('') # To have a new line after the progress periods
self.info("Number of input structures: %i" % (in_st_num))
self.info("Number of processed structures: %i" % output_structure_count)
self.info("Number of processed compounds: %i" % output_compound_count)
if num_skipped_sts:
self.warning(
"\n%i structures were skipped because they were invalid." %
num_skipped_sts)
self.info("Number of output files: %i" % len(output_files))
if True:
# Report to the log file the numbers of structure per each subjob:
num_list = []
for i, numposes in self.outfile_structures.items():
num_compounds = self.outfile_compounds[i]
# NUM:(ROOTS, STRUCTUES)
num_list.append("%i:(%i, %i)" % (i, num_compounds, numposes))
msg = "\nNumber of (compounds, structures) for each output file:"
msg += "\n%s" % ", ".join(num_list)
self.info(msg)
self.setOutput(1, pipeio.Structures(output_files,
output_structure_count))
# Ev:78695
[docs]class RestoreTitlesStage(stage.Stage):
"""
Stage for copying titles for each input ligand from the
's_pipeline_title_backup' property to the 's_m_title' property.
The stage takes one input structure file set and generates one set of
output structure files, each containing about 100,000 structures.
"""
[docs] def __init__(self, *args, **kwargs):
"""
See class docstring.
"""
specs = """
SOURCE_FIELD = string(default="s_pipeline_title_backup")
DESTINATION_FIELD = string(default="s_m_title")
"""
stage.Stage.__init__(self, specs=specs, *args, **kwargs)
self.addExpectedInput(1, "structures", True)
self.addExpectedOutput(1, "structures", True)
[docs] def operate(self):
"""
For each structure, sets the value of the DESTINATION_FIELD to the
the value of the SOURCE_FIELD. If no such value exists, raises
a RuntimeError.
"""
self.input_files = self.getInput(1).getFiles()
self.source_field = self['SOURCE_FIELD']
self.destination_field = self['DESTINATION_FIELD']
self.info('Will copy values from property "%s" to property "%s"' %
(self.source_field, self.destination_field))
self.info("\nProcessing ligand files...")
dp = pipeutils.DotPrinter()
writer = structure.MultiFileStructureWriter(self.genOutputFileName(1),
extension=".maegz")
st_num = 0
for ligfile in self.input_files:
for st in pipeutils.get_reader(ligfile):
st_num += 1
# Print progress period:
dp.dot()
try:
value = st.property[self.source_field]
except KeyError:
raise RuntimeError(
'Error reading property "%s" for structure %i of file "%s"'
+ (self.source_field, st_num, ligfile))
try:
st.property[self.destination_field] = value
except:
raise RuntimeError(
'Error setting property "%s" for structure %i of file "%s"'
+ (self.destination_field, st_num, ligfile))
writer.append(st)
writer.close()
output_files = writer.getFiles()
self.setOutput(1, pipeio.Structures(output_files, st_num))
# Print summary:
self.info('') # To have a new line after the progress periods
self.info("Number of processed structures: %i" % (st_num))
self.info("Number of output files: %i" % len(output_files))
# EOF