"""
A stage for extracting a subset of ligands/compounds.
The Pull stage identifies a subset of compounds from one set of ligand
structures, and extracts all variants of those compounds from a second set of
ligand structures.
Copyright Schrodinger, LLC. All rights reserved.
"""
import sys
from schrodinger import structure
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage
from schrodinger.utils.fileutils import is_poseviewer_file
[docs]class PullStage(stage.Stage):
"""
Stage for extracting a subset of ligands/compounds (and their variants)
from a second set of ligand files, given the number or percent to keep
from a first set of ligand files. The first set must be ordered; i.e.,
the ligands to keep appear earliest in the set. Compounds in the second
set that don't appear in the first set are ignored.
The keywords specific to this stage are...
UNIQUEFIELD The ligand property that identifies a compound.
Multiple ligands with the same UNIQUEFIELD value
are considered variants of the compound. The
default is the structure title ('s_m_title').
NUM_TO_KEEP The number of compounds from the first set to
extract from the second set. The compounds kept
are those that appear earliest in the first set.
PERCENT_TO_KEEP The percentage of compounds from the first set to
extract from the second set. The compounds kept
are those that appear earliest in the first set.
Ignored if NUM_TO_KEEP is used.
KEEP_CHARGES If set to True, then after the ligand is pulled from
the original file, the partial charges are set to
what they were in the second file (For QPLD).
NOTE: If set, only ONE ligand with specified field
should be in the first file set.
NOTE: solvation_charge property is assummed to be
equal to the partial_charge.
CHARGE_PROPERTY If KEEP_CHARGES is True, the charges are taken from
this atom-level property. Default is 'r_m_charge1'.
These charges from the first set are placed in the
'r_m_charge1' (partial_charge) and 'r_m_charge2'
(solvation_charge) properties of the pulled set, not
the CHARGE_PROPERTY in the pulled set.
If neither NUM_TO_KEEP or PERCENT_TO_KEEP are present, all compounds in
the first set are extracted from the second set. In all cases, every
variant of a kept compound from the second set is extracted.
This stage takes one ordered input structure set that identifies the
ligands to extract, a second input structure set from which those ligands
are extracted, and it generates one output structure file set, each
containing about 100,000 structures.
Issues:
Keeping the first N compounds may not choose compounds from Glide results
in the proper manner. If the variants of a compound are chemically
distinct, it's appropriate to choose the top compounds based on GlideScore.
However, if the variants differ only in their conformations (such as when
saving multiple poses per ligand), Emodel (not GlideScore) should be used
to determine the representative variant of the compound for GlideScore
comparison with other compounds. Hopefully, this doesn't come up
often, because it would be very tricky to reflect the proper ordering
in the input files if there are both Ligprep-type variants and
conformation-variants.
"""
[docs] def __init__(self, *args, **kwargs):
"""
See class docstring.
"""
specs = """
UNIQUEFIELD = string(default="s_m_title")
NUM_TO_KEEP = integer(default=0) # Maxium number of unique compounds to pull
PERCENT_TO_KEEP = float(default=0.0) # Percent of compounds to retain
KEEP_CHARGES = boolean(default=False) # Whether to save the partial charges
CHARGE_PROPERTY = string(default="r_m_charge1") # Charge property to keep
"""
stage.Stage.__init__(self, specs=specs, *args, **kwargs)
self.addExpectedInput(1, "structures", True)
self.addExpectedInput(2, "structures", True)
self.addExpectedOutput(1, "structures", True)
self.input_st_count = 0
[docs] def operate(self):
"""
Extract a subset of the first set compounds from the second set of
ligand input structure files. Makes a list of compounds (identified
by UNIQUEFIELD) from the first set, pares down the list according to
the NUM_TO_KEEP or PERCENT_TO_KEEP setting, and extract all variants
of those compounds from the second set. Raises a RuntimeError if
any input file cannot be read, if there is a problem accessing the
UNIQUEFIELD property of any ligand, if there is a problem writing any
output file, or if no ligands are extracted.
"""
ligfiles = self.getInput(1).getFiles()
self.pull_field = self['UNIQUEFIELD']
num_comps_to_keep = self['NUM_TO_KEEP']
per_comps_to_keep = self['PERCENT_TO_KEEP']
self.keep_charges = self['KEEP_CHARGES']
self.charge_prop = self['CHARGE_PROPERTY']
self.info("Generating a set of compounds in the subset")
all_compounds = []
# Key: compound code; value: list of partial charges lists
self.partial_charges = {}
st_num = 0
for ligfile in ligfiles:
is_pv_file = is_poseviewer_file(ligfile)
for st in pipeutils.get_reader(ligfile):
st_num += 1
if st_num == 1 and is_pv_file:
continue # Skip the receptor
# Print progress period:
if st_num % 1000 == 0:
sys.stdout.write(".")
sys.stdout.flush()
try:
code = st.property[self.pull_field]
except KeyError:
err = "Error reading field %s for st %i (file %s)" % (
self.pull_field, st_num, ligfile)
raise RuntimeError(err)
if self.keep_charges:
try:
charges_list = [
a.property[self.charge_prop] for a in st.atom
]
except KeyError:
err = "Can't find '%s' charge properties for st %i (file %s). Skipping." % (
self.charge_prop, st_num, ligfile)
self.warning(err)
continue
if code not in all_compounds:
all_compounds.append(code)
self.partial_charges[code] = [charges_list]
else:
self.partial_charges[code].append(charges_list)
# not keeping charges:
elif code not in all_compounds:
all_compounds.append(code)
input_compound_count = len(all_compounds)
self.info('') # line break after all the periods
if num_comps_to_keep: # Number of compounds to keep specified
if num_comps_to_keep > input_compound_count:
num_comps_to_keep = input_compound_count
self.compounds_to_keep = all_compounds[0:num_comps_to_keep - 1]
msg = "Pulling %i of %i compounds." % (num_comps_to_keep,
input_compound_count)
self.info(msg)
elif per_comps_to_keep: # Percentage of compounds to keep specified
int_percent = int(per_comps_to_keep)
num_comps_to_keep = int(per_comps_to_keep * input_compound_count /
100)
if num_comps_to_keep < 1:
num_comps_to_keep = 1
if num_comps_to_keep > input_compound_count:
num_comps_to_keep = input_compound_count
self.compounds_to_keep = all_compounds[:num_comps_to_keep]
if int_percent == per_comps_to_keep:
self.info(" Pulling %i" % int_percent + "% of compounds.")
else:
self.info(" Pulling %f" % per_comps_to_keep +
"% of compounds.")
msg = " Actually pulling %i of %i compounds." % (
num_comps_to_keep, input_compound_count)
self.info(msg)
else:
# Keeping all structures
self.compounds_to_keep = all_compounds
msg = " Pulling all %i compounds from the original set" % input_compound_count
self.info(msg)
self.pullCompounds()
[docs] def pullCompounds(self):
# Convert to a set for faster lookups:
self.compounds_to_keep_set = set(self.compounds_to_keep)
self.compounds_to_keep = None # to save memory
origligfiles = self.getInput(2).getFiles()
outfile = self.genOutputFileName(1, extension=".maegz")
writer = structure.StructureWriter(outfile)
st_num = 0
kept_num = 0
for ligfile in origligfiles:
for st in pipeutils.get_reader(ligfile):
st_num += 1
# Print progress period:
if st_num % 1000 == 0:
sys.stdout.write(".")
sys.stdout.flush()
try:
code = st.property[self.pull_field]
except KeyError:
raise RuntimeError(
"Error reading property %s for structure %i of file %s"
+ (self.pull_field, st_num, ligfile))
continue
if code in self.compounds_to_keep_set:
# if code in self.compounds_to_keep:
kept_num += 1
if self.keep_charges:
# If KEEP_CHARGES is set, restore the partial charges:
# usually only one item in this list
for charges_list in self.partial_charges[code]:
for i, atom in enumerate(st.atom):
charge = charges_list[i]
atom.partial_charge = charge
atom.solvation_charge = charge
writer.append(st)
# print 'DEBUG: Re-applied charges to structure',
# kept_num
else: # not keeping charges
writer.append(st)
writer.close()
subset_files = [outfile]
print("\n") # 2 new lines (after all those progress periods)
if kept_num == 0:
raise RuntimeError(
"Could not find the ligands in the original files")
self.setOutput(1, pipeio.Structures(subset_files, kept_num))
# Clear partial charges to save space & time when dumping the stage
# later:
self.partial_charges = {}
# EOF