Source code for schrodinger.structutils.smilesfilter

"""
A group of functions and classes to help filtering structure files on
unique SMILES strings.

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Mike Beachy, Matvey Adzhigirey
import os
import sys
from textwrap import dedent

import schrodinger.structure as structure
import schrodinger.structutils.smiles as smiles
import schrodinger.utils.cmdline
import schrodinger.utils.fileutils as futils
import schrodinger.utils.log as log

# CVS/RCS-style "$Revision$" keyword string recording this module's revision.
# NOTE(review): nothing in the code visible in this file reads _version (the
# option parser is given __doc__ as its version source) - confirm whether it
# is still needed.
_version = "$Revision: 1.20 $"

# Module-level output logger. The script support code in this module sets its
# level from the -quiet/-verbose options, uses it for command-line warnings,
# and hands it to SmilesErrorHandler and FilterReporter.
_output_logger = log.get_output_logger(
    "output.schrodinger.structutils.smilesfilter")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Functions
#


def remove_dupes(smiles_generator,
                 struct_iterator,
                 put_unique,
                 put_dupes=None,
                 error_handler=None,
                 reporter=None):
    """
    Split the structures from 'struct_iterator' into unique ones and
    duplicates, keyed on the SMILES string computed by 'smiles_generator'.

    The first structure producing a given SMILES string is handed to
    'put_unique'; every later structure with the same string is counted as a
    duplicate and, if 'put_dupes' is provided, handed to it.

    :type smiles_generator: SmilesGenerator
    :param smiles_generator: Object whose getSmiles(structure) method returns
        the SMILES string for a structure.

    :type struct_iterator: iterator return Structure objects
    :param struct_iterator: Any iterator returning Structure objects (such as
        a StructureReader) will work.

    :type put_unique: callable
    :param put_unique: Called with (structure, SMILES string) for each
        structure whose SMILES string has not been seen before.

    :type put_dupes: callable
    :param put_dupes: Called with (structure, SMILES string) for each
        duplicate. If None, duplicates are only counted.

    :type error_handler: callable
    :param error_handler: A callable that takes (index, structure, exception).
        It is invoked for any RuntimeError raised while a structure is being
        processed (SMILES generation, the put_* callbacks and the reporter
        call all run inside the same try block). Without a handler the
        RuntimeError propagates.

    :type reporter: FilterReporter
    :param reporter: If present, each result is logged through logResult()
        and the totals through summarize().
    """
    # Maps each SMILES string to the 1-based index of the first structure
    # that produced it.
    first_seen = {}
    n_read = 0
    n_dupes = 0
    n_errors = 0

    for n_read, st in enumerate(struct_iterator, start=1):
        try:
            pattern = smiles_generator.getSmiles(st)
            # 0 means "not seen before"; real indices start at 1.
            original = first_seen.get(pattern, 0)
            # Position of this structure among the saved duplicates, or 0
            # when it was not saved as a duplicate.
            saved_as = 0
            if not original:
                first_seen[pattern] = n_read
                put_unique(st, pattern)
            else:
                n_dupes += 1
                if put_dupes is not None:
                    put_dupes(st, pattern)
                    saved_as = n_dupes
            if reporter:
                reporter.logResult(n_read, original, st.title, pattern,
                                   saved_as)
        except RuntimeError as err:
            n_errors += 1
            if not error_handler:
                raise
            error_handler(n_read, st, err)

    if reporter:
        reporter.summarize(n_read, len(first_seen), n_dupes, n_errors)
def add_smiles(smiles_generator,
               struct_iterator,
               put_output,
               error_handler=None):
    """
    Calculate SMILES strings with 'smiles_generator' for each structure in
    'struct_iterator' and pass each (structure, SMILES string) pair on to
    'put_output'.

    :type smiles_generator: SmilesGenerator
    :param smiles_generator: Object whose getSmiles(structure) method returns
        the SMILES string for a structure.

    :type struct_iterator: iterator return Structure objects
    :param struct_iterator: Any iterator returning Structure objects (such as
        a StructureReader) will work.

    :type put_output: callable
    :param put_output: This function will be called with (structure, SMILES
        string) arguments when a SMILES string can be calculated.

    :type error_handler: callable
    :param error_handler: A callback function that takes (index, structure,
        exception), where index is the 1-based position of the structure in
        'struct_iterator'. It is called when a RuntimeError is raised while a
        structure is processed (SMILES generation and the put_output call
        share one try block). Without a handler the RuntimeError propagates.
    """
    for index, st in enumerate(struct_iterator, start=1):
        try:
            put_output(st, smiles_generator.getSmiles(st))
        except RuntimeError as err:
            if not error_handler:
                raise
            error_handler(index, st, err)
class FilterReporter(object):
    """
    A class to handle reporting of results from the remove_dupes function.
    """

    def __init__(self, logger):
        """
        :type logger: logging.Logger
        :param logger: The logger instance that will be used to output
            messages. Its info() and debug() methods are used.
        """
        self.logger = logger

    def logResult(self, index, orig_index, title, smiles, dupe_index):
        """
        Log messages for a given result.

        :type index: int
        :param index: Index of the current structure.

        :type orig_index: int
        :param orig_index: Index of the unique structure of which the current
            one is a duplicate. A false value (0) marks the current structure
            as unique.

        :type title: str
        :param title: Title of the current structure.

        :type smiles: str
        :param smiles: SMILES string of the structure.

        :type dupe_index: int
        :param dupe_index: Index of the duplicate structure in the saved
            duplicates file. A false value (0) omits the destination line.
        """
        if orig_index:
            headline = "Structure %d is a duplicate of structure %d." % (
                index, orig_index)
        else:
            headline = "Structure %d is unique." % (index,)

        # The leading empty entry makes every report start on a fresh line.
        lines = [
            "",
            headline,
            " title: %s" % title,
            " smiles: %s" % smiles,
        ]
        if dupe_index:
            lines.append(" destination: Saved as structure %d in the "
                         "duplicates file." % dupe_index)
        report = "\n".join(lines)

        # Log duplicates at the default INFO level but non-duplicates only
        # at DEBUG level.
        emit = self.logger.info if orig_index else self.logger.debug
        emit(report)

    def summarize(self, total, unique, duplicates, error_count):
        """
        Generate a summary of the filter results and log it at INFO level.

        The summary is one header line followed by one indented line per
        count; the counts are right-aligned to the width of 'total'. The
        error line is included only when 'error_count' is non-zero.

        :type total: int
        :param total: Total number of structures filtered.

        :type unique: int
        :param unique: Number of unique structures found.

        :type duplicates: int
        :param duplicates: Number of duplicates found.

        :type error_count: int
        :param error_count: Number of structures generating errors in SMILES
            conversion.
        """
        # Column width that fits the largest count expected (the total).
        width = len(str(total))

        def _plural(count):
            return "" if count == 1 else "s"

        # Built line by line rather than from one interpolated template, so
        # the layout does not depend on whitespace inside a string literal.
        lines = [
            "",
            "Summary:",
            "    %*d structure%s read" % (width, total, _plural(total)),
            "    %*d unique structure%s written" %
            (width, unique, _plural(unique)),
            "    %*d duplicate%s" % (width, duplicates, _plural(duplicates)),
        ]
        if error_count:
            lines.append("    %*d error%s" %
                         (width, error_count, _plural(error_count)))

        self.logger.info("\n".join(lines))
class SmilesErrorHandler(object):
    """
    A class that acts as an error handler for cases where SMILES generation
    fails. It is used as a callable that takes arguments of (index, struct,
    exception).
    """

    def __init__(self, logger, struct_writer=None, message=None):
        """
        :type logger: logging.Logger
        :param logger: The logger instance that will be used to output
            messages. Its warning() method is used.

        :type struct_writer: obj with append method
        :param struct_writer: An instance that will log structures that have
            errors. Any instance with an append method (such as a
            StructureWriter or list) can be used. If None, failing structures
            are not saved.

        :type message: str
        :param message: The message to use as the logging template. It should
            have format conversion specifiers for (index, structure title,
            exception) (types of int, str, exception). If None, a default
            template is used.
        """
        self.logger = logger
        self.struct_writer = struct_writer
        if message is None:
            message = ("Error: SMILES generation failed for "
                       "structure %d with title '%s'.\n"
                       "The error message was '%s'.")
        self.message = message

    def __call__(self, index, struct, exception):
        """
        Log the failure and save the offending structure.

        The formatted message is sent to the object's 'logger' at WARNING
        level, and the structure is appended to the object's 'struct_writer'
        (anything with an append method) when one was supplied.

        :type index: int
        :param index: Index of the structure that failed.

        :param struct: The structure that failed; its 'title' attribute is
            used in the message.

        :type exception: Exception
        :param exception: The exception raised for the structure.
        """
        self.logger.warning(self.message % (index, struct.title, exception))
        # Compare against None rather than testing truthiness: an empty list
        # (an explicitly supported writer) is falsy and would otherwise never
        # receive its first structure.
        if self.struct_writer is not None:
            self.struct_writer.append(struct)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Below are functions to support running this module as a script.
#

# Accepted values of the -dupes command-line option.
DUPES_IGNORE, DUPES_DISCARD, DUPES_SAVE = ("ignore", "discard", "save")


def _parse_options(args):
    """
    Parse command-line options.

    Like OptionParser.parse_args(), return a tuple of (options, arguments).
    (Here, options is an optparse.Values instance and arguments is a list of
    the remaining command-line arguments.) In addition to the parsed options,
    the returned options object carries a 'stereo_enum' attribute holding the
    SmilesGenerator stereo value that corresponds to the -stereo choice. As a
    side effect the module output logger level is set from the options.

    :type args: list of str
    :param args: This should be the list of arguments without any program
        name at the 0 index (e.g. pass in sys.argv[1:], not sys.argv).

    :raise SystemExit: If the arguments are not good: fewer than two file
        arguments, an input file that is not Maestro or SD format, or an
        input file that does not exist.
    """
    global _output_logger

    OptionParser = schrodinger.utils.cmdline.SingleDashOptionParser

    # NOTE(review): the line breaks inside this literal were reconstructed
    # from whitespace-collapsed text; the wording is unchanged. Confirm the
    # layout against the script's actual -h output.
    usage = dedent(
        """\
        usage: uniquesmiles [options] <input structure file> <output file(s)>

        Supported input formats: Maestro and SDF.
        Supported output formats: SMILES (.smi) and Maestro (both can be
        specified).

        Generate unique SMILES strings for the input structures, optionally
        removing repeated structures from the output. For Maestro output, the
        unique smiles string will be saved in a structure property named %s
        or %s, depending on whether stereochemical information is included
        in the string.""" % (smiles.unique_smiles_prop_name,
                             smiles.unique_smiles_stereo_prop_name))

    # NOTE(review): the module docstring is passed as the version source
    # while the module-level _version revision string is unused - confirm
    # which one the parser is meant to receive.
    parser = OptionParser(version_source=__doc__, usage=usage)

    parser.add_option(
        "-stereo",
        action="store",
        type="choice",
        choices=["auto", "3d", "annotation", "none"],
        dest="stereo",
        help=
        """Specify a stereochemistry option. The following options are
        supported: 'auto' (derive stereochemistry from geometry for 3D
        structures and annotations for 2D structures), '3d' (derive
        stereochemistry from 3D geometry, ignoring annotations), 'annotation'
        (derive stereochemistry from pre-existing mmstereo properties), or
        'none' (don't include stereochemistry in the SMILES). Default is
        'auto'.
NOTE: In previous release, the current "auto" option was called "3d"."""
    )
    parser.add_option("-quiet",
                      action="store_const",
                      const=log.CRITICAL,
                      dest="loglevel",
                      help="Run with as little output as possible.")
    parser.add_option(
        "-verbose",
        action="store_const",
        const=log.DEBUG,
        # -verbose selects the DEBUG level, which is the level at which
        # FilterReporter logs non-duplicate structures.
        dest="loglevel",
        help="Run with verbose output.")
    parser.add_option(
        "-dupes",
        action="store",
        type="choice",
        choices=[DUPES_IGNORE, DUPES_DISCARD, DUPES_SAVE],
        dest="dupes",
        help=
        "Choose destiny of structures having duplicate SMILES strings: 'ignore', 'discard', or 'save'. If 'ignore' (the default), do nothing; if 'discard', discard duplicates; if 'save', write them to a separate file named <output_name>-dupes.<mae|smi>."
    )

    #
    # DEFAULT OPTION VALUES
    # Set defaults for all possible options.
    #
    parser.set_defaults(
        dupes=DUPES_IGNORE,
        loglevel=log.INFO,
        stereo="auto",  # Update -stereo help comment if this changes.
    )

    opts, filenames = parser.parse_args(args)

    # Make sure that two or more file arguments are provided.
    if len(filenames) < 2:
        parser.print_help()
        sys.exit(1)

    # The first file argument is the input; it must be Maestro or SD format.
    informat = futils.get_structure_file_format(filenames[0])
    if informat not in [futils.MAESTRO, futils.SD]:
        parser.print_help()
        _output_logger.warning(
            "Only Maestro and SD files can be used for input.")
        sys.exit(1)

    if not os.path.exists(filenames[0]):
        _output_logger.warning("The input file '%s' could not be found." %
                               filenames[0])
        sys.exit(1)

    # Convert the stereo option into the value that needs to be passed into
    # the SmilesGenerator.
    stereo_opt_map = {
        "auto": smiles.STEREO_FROM_ANNOTATION_AND_GEOM,
        "3d": smiles.STEREO_FROM_GEOMETRY,
        "annotation": smiles.STEREO_FROM_ANNOTATION,
        "none": smiles.NO_STEREO,
    }
    opts.stereo_enum = stereo_opt_map.get(opts.stereo)

    # Configure the output logger level.
    _output_logger.setLevel(opts.loglevel)

    return opts, filenames


def _make_smiles_writer(filename):
    """
    Create a function that takes a structure and a SMILES string, and writes
    a SMILES format file to file 'filename'.

    Only the SMILES string and the structure title are written: each call
    wraps them in a SmilesStructure and appends it to a SmilesWriter.

    :type filename: str
    :param filename: Path of the SMILES file to write.

    :return: A callable taking (structure, SMILES string).
    """
    sfile_out = structure.SmilesWriter(filename)

    def put_output(st, smiles):
        smiles_st = structure.SmilesStructure(smiles, st.title)
        sfile_out.append(smiles_st)

    return put_output


def _make_mae_writer(filename, propname):
    """
    Returns a function that takes a structure and a SMILES string. The
    created function sets the SMILES property and writes the structure to
    the mae file 'filename'.

    Note that the structure passed to the returned function is modified in
    place: its 'propname' property is set to the SMILES string.

    :type filename: str
    :param filename: Path of the Maestro file to write.

    :type propname: str
    :param propname: Name of the structure property that receives the SMILES
        string.

    :return: A callable taking (structure, SMILES string).
    """
    maefile_out = structure.MaestroWriter(filename)

    # This file will be closed when garbage collected.
    def put_output(st, smiles):
        st.property[propname] = smiles
        maefile_out.append(st)

    return put_output


def _main(args=None):
    """
    The main routine for filtering and SMILES generation.

    The first file argument is the input structure file; every further file
    argument is an output file whose extension selects SMILES or Maestro
    output. Exits via sys.exit(1) (SystemExit) if an output filename has an
    unrecognized extension, or if option parsing rejects the arguments.

    :type args: list of str
    :param args: This should be the list of arguments without any program
        name at the 0 index (e.g. pass in sys.argv[1:], not sys.argv). If
        None, sys.argv[1:] is used.

    :return: 0 on completion.
    """
    global _output_logger

    if args is None:
        args = sys.argv[1:]

    # Get options
    (opts, files) = _parse_options(args)

    _output_logger.debug("Command-line arguments: %s\n" % " ".join(args))

    inputfile = files[0]

    # Create the functions that take (structure, SMILES string) arguments
    # and save the results for unique/output and/or duplicate structures.
    propname = smiles.get_property_name(opts.stereo_enum)
    put_outputs = []
    put_dupes = []

    # The output file basename is determined by the MAE output file if one
    # is present. If the only output is a SMILES file, that will be used to
    # set the basename instead.
    basename = None
    for ofile in files[1:]:
        dir_name, file_name = os.path.split(ofile)
        base, ext = futils.splitext(file_name)
        if ext == ".smi":
            # Only define basename if it hasn't been defined by a .mae file.
            if not basename:
                basename = base
            put_outputs.append(_make_smiles_writer(ofile))
            # Duplicate writers are only created for -dupes save; the
            # duplicates file sits next to the output file it mirrors.
            if opts.dupes == DUPES_SAVE:
                dfile = os.path.join(dir_name, base + "-dupes" + ext)
                put_dupes.append(_make_smiles_writer(dfile))
        elif ext in [".mae", '.maegz', '.mae.gz']:
            basename = base
            put_outputs.append(_make_mae_writer(ofile, propname))
            if opts.dupes == DUPES_SAVE:
                dfile = os.path.join(dir_name, base + "-dupes" + ext)
                put_dupes.append(_make_mae_writer(dfile, propname))
        else:
            print("ERROR: Output file extension '%s' is not " % ext +
                  "recognized as a valid output type.")
            print("Supported extensions: *.mae, *.maegz, *.mae.gz, *.smi")
            sys.exit(1)

    # Create a function that iterates over all functions in put_outputs and
    # calls it with the provided structure and smiles arguments.
    put_output = lambda st, smiles: [fn(st, smiles) for fn in put_outputs]

    if put_dupes:
        # Create a function that iterates over all functions in put_dupes
        # and calls it with the provided structure and smiles arguments.
        put_dupe = lambda st, smiles: [fn(st, smiles) for fn in put_dupes]
    else:
        # No duplicate writers (-dupes discard): remove_dupes then only
        # counts duplicates.
        put_dupe = None

    # Create an error handler that will write bad structures to a file.
    # NOTE(review): this path has no directory component, unlike the -dupes
    # files which are joined with the output file's directory - confirm that
    # is intended.
    errors = structure.MaestroWriter(basename + "-errors.mae")
    error_handler = SmilesErrorHandler(_output_logger, errors)

    smiles_generator = smiles.SmilesGenerator(opts.stereo_enum, unique=True)
    struct_reader = structure.StructureReader(inputfile)

    # Ignore duplicates; just add SMILES strings and write to a new file.
    if opts.dupes == DUPES_IGNORE:
        add_smiles(smiles_generator, struct_reader, put_output, error_handler)
    # Filter out duplicates, either dumping them or saving them to a new
    # file.
    else:  # DUPES_DISCARD or DUPES_SAVE
        # Add an extra newline to the error handler because the remove_dupes
        # filter function has multi-line output and it needs the separation.
        error_handler.message += "\n"
        remove_dupes(smiles_generator, struct_reader, put_output, put_dupe,
                     error_handler, FilterReporter(_output_logger))

    return 0


# Script entry point: run _main through the Schrodinger main_wrapper helper.
if __name__ == "__main__":
    from schrodinger.utils.cmdline import main_wrapper
    main_wrapper(_main)