Source code for schrodinger.pipeline.stages.gencodes

"""
Stage for generating and storing codes for ligands/compounds and variants.

It also recombined input ligands into a number of subjobs specified via -NJOBS
argument (If -adjust is used, this number is also adjusted based on the
MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords.

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Matvey Adzhigirey

import os
import sys

from schrodinger import structure
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage
from schrodinger.utils.fileutils import SMILES
from schrodinger.utils.fileutils import SMILESCSV
from schrodinger.utils.fileutils import get_structure_file_format
from schrodinger.utils.fileutils import is_poseviewer_file


[docs]class RecombineStage(stage.Stage):
    """
    Stage for generating compound and variant codes, which are stored as
    properties.

    It also recombined input ligands into a number of subjobs specified via
    -NJOBS argument (If -adjust is used, this number is also adjusted based
    on the MIN_SUBJOB_STS and MAX_SUBJOB_STS keywords.

    See the specs for the keywords specific to this stage.

    A ligand/compound is identified by the UNIQUEFIELD property, and
    structures with the same value of this property are indexed as
    variants.  If UNIQUEFIELD is not specified, each structure is treated as
    a compound (i.e., there are no variants), and the label for the compound
    is just the structure index number as all the files and structures are
    read.

    The output structures will have their OUTCOMPOUNDFIELD set to their compound
    label (either the UNIQUEFIELD property value or the structure index). If
    OUTCOMPOUNDFIELD is "s_m_title", the original titles will be saved to the
    's_pipeline_title_backup' property.

    OUTVARIANTFIELD property ('s_pipeline_variant' by default) is set to
    "<compound_label>-<variant_index>".

    The stage takes one input structure file set and generates one set of
    output structure files, each containing about 100,000 structures.
    """

[docs]    def __init__(self, *args, **kwargs):
        """
        See class docstring.
        """

        specs = """
        GENCODES = boolean(default=True) # Whether to generate compound and variant codes.
        NUMOUT = option("preserve", "njobs", "fewest", default="preserve") # How to generate the number of output files. "preserve" - same as the input; "fewest" - as few output files as possible; "njobs" - derive from -NJOBS option.
        PRESERVE_NJOBS = boolean(default=False) # If not recombining,
        UNIQUEFIELD = string(default="NONE") # Property that is unique to each compound. If "NONE", then each structure is unique.
        OUTCOMPOUNDFIELD = string(default="s_pipeline_compound_code") # Field to put the compound codes into (if GENCODES is True).
        OUTVARIANTFIELD = string(default="s_pipeline_variant") # Field to put variant codes into (if GENCODES is True)
        OUTFORMAT = option("maegz", "mae", "sdf", default="maegz") # Output format
        SKIP_BAD_LIGANDS = boolean(default=True)
        SKIP_RECEPTOR = boolean(default=True) # Whether to remove receptors from the set (PV files only)
        MIXLIGS = boolean(default=False) # Whether to re-shuffle the ligands between subjobs.
        SKIP_NOUNIQUE_LIGANDS = boolean(default=False) # Whether to skip ligands that have no unique field.
        MIN_SUBJOB_STS = integer(default=None) # Minimum subjob size
        MAX_SUBJOB_STS = integer(default=None) # Maximum subjob size
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "structures", True)

[docs]    def operate(self):
        """
        Generates ligand compound and variant codes, and saves the structures into
        fewer number of files (max 100,000 per file, except a new output file
        must start with a new compound).  Adds the compound label,
        compound-and-variant, and original title to the output structures as
        properties.  Raises a RuntimeError if the output file format is
        invalid, if there is a problem reading an input file or writing an
        output file, or if the UNQIUEFIELD property doesn't exist for a
        structure.
        """
        """
        Read all input structures, while optionally skipping bad structures,
        and save each structure into a subjob input file.

        Also OUTCOMPOUNDFIELD is set to the property specified by UNIQUEFIELD.
        If UNIQUEFIELD is set to NONE, then OUTCOMPOUNDFIELD is set to a number
        which is unique for each input ligand.

        If UNIQUEFIELD is NONE, then OUTCOMPOUNDFIELD
        is determined either from UNIQUEFILED property, or each structure
        is assumed to be its own compound.
        If OUTCOMPOUNDFIELD is "s_m_title", then the original title is backed
        up into the "s_pipeline_title_backup" property.

        Structures can also be shuffled within subjobs in order to make
        subjob lengths more uniform if MIXLIGS is set to True.

        """

        self.input_files = self.getInput(1).getFiles()
        outformat = self['OUTFORMAT']
        if self['NUMOUT'] == 'njobs':
            self.info(
                "The number of output files will be derived from -NJOBS value")
            numout_njobs = True
            preserve_njobs = False
            min_job_size = self['MIN_SUBJOB_STS']
            max_job_size = self['MAX_SUBJOB_STS']
        elif self['NUMOUT'] == 'preserve':
            self.info("Will preserve the number of files")
            numout_njobs = False
            preserve_njobs = True
        else:
            self.info("Will create as few output files as possible")
            numout_njobs = False
            preserve_njobs = False
        if self['MIXLIGS']:
            self.warning('WARNING: Keyword MIXLIGS is no longer supported')

        gencodes = self['GENCODES']

        # These keywords are ignored if not generating codes:
        uniquefield = self['UNIQUEFIELD']
        variantfield = self['OUTVARIANTFIELD']
        compoundfield = self['OUTCOMPOUNDFIELD']

        # Make sure the keywords are specified properly:
        if outformat not in ["mae", "maegz", "sdf"]:
            raise RuntimeError('Output Format needs to be "mae" or "sdf"!')

        if not gencodes:
            # Don't generate codes
            for keyword in [
                    'UNIQUEFIELD', 'OUTVARIANTFIELD', 'OUTCOMPOUNDFIELD'
            ]:
                if self[keyword]:
                    self.warning("GENCODES is False; ignoring %s keyword" %
                                 keyword)

        if numout_njobs:
            # If NUMOUT was set to "njobs", we need to count the number of
            # input structures:

            if hasattr(self, 'input_st_num'):  # already counted
                # Already calculated
                self.info(
                    "Number of input structures was already calculated: %i" %
                    self.input_st_num)
            else:
                # FIXME If input number of structures is know, use it

                self.info("Counting number of input structures")
                input_st_num = 0
                for filename in self.input_files:
                    try:
                        input_st_num += structure.count_structures(filename)
                    except:
                        self.exit("ERROR Could not get st count for file: " +
                                  filename)
                self.input_st_num = input_st_num
                self.dump()  # Save the instance

            self.info("Number of input structures: %i" % self.input_st_num)

            # Determine the number of subjobs to split the ligand set into:
            (n_st_per_file, njobs,
             adjusted) = self.getAdjustedNJobs(self.input_st_num, min_job_size,
                                               max_job_size)
            if adjusted:
                self.info(
                    "Adjusting number of jobs to yield %i-%i ligands per job" %
                    (min_job_size, max_job_size))
            self.info("Will generate %i subjobs" % njobs)

        elif preserve_njobs:
            njobs = len(self.input_files)
            self.info("Separating ligands into %i subjobs." % njobs)

        self.outfile_compounds = {}  # Number of compounds in each input job
        self.outfile_structures = {}  # Number of structures in each input job
        recombined_writers_dict = {}  # keys are filenums

        # The compound code of the previous structure:
        current_compound_id = None
        current_compound_id_variant = 0
        number_of_compounds = 0

        in_st_num = 0  # Processing ligand #
        output_structure_count = 0

        max_file_size = 100000  # Just less than 2GB

        if gencodes:
            self.info("  Will generate ligand compound and variant codes...")
            if uniquefield == "NONE":
                self.info(
                    "    Each ligand will be treated as a unique compound.")
            else:
                self.info("    Unique field: " + uniquefield)
            sys.stdout.flush()

        self.info("\nProcessing ligand files...")

        output_files = []
        curr_writer = None
        prev_filenum = None
        filenum = 0

        if numout_njobs:
            dp = pipeutils.DotPrinter(self.input_st_num)
        else:
            dp = pipeutils.DotPrinter()

        num_skipped_sts = 0
        in_st_num = 0

        infilenum = 0
        for infile in self.input_files:
            infilenum += 1
            iformat = get_structure_file_format(infile)

            self.debug("\nReading file: %s (format %s)" % (infile, iformat))

            # Will open unknown files as SD:
            if iformat == SMILES:
                sts = structure.SmilesReader(infile)
            elif iformat == SMILESCSV:
                sts = structure.SmilesCsvReader(infile)
            else:
                sts = pipeutils.get_reader(infile)

            if iformat in [SMILES, SMILESCSV]:
                if compoundfield != 's_m_title':
                    msg = "ERROR: Could not save compound codes to property '%s' because the input structure is in SMILES format" % compoundfield
                    self.exit(msg)

            st_file_pos = 0  # Location of this structure in the input file

            while True:  # Iterate until StopIteration:
                # For each structure in this file:
                try:
                    st = next(sts)
                except StopIteration:
                    break  # EOF, exit
                except Exception:  # Bad structure
                    if self['SKIP_BAD_LIGANDS']:
                        num_skipped_sts += 1
                        in_st_num += 1
                        st_file_pos += 1
                        continue  # Skip the bad structure
                    else:
                        msg = "ERROR: Could not read next structure from file: %s" % infile
                        msg += "Try setting SKIP_BAD_LIGANDS to True"
                        self.exit(msg)
                else:  # Read OK
                    in_st_num += 1

                # If got here, then read the structure okay
                st_file_pos += 1
                is_receptor = False

                # Skip first structure if PV file is specified:
                if st_file_pos == 1 and is_poseviewer_file(infile):
                    if self['SKIP_RECEPTOR']:
                        continue  # Skip the first structure (receptor)
                    else:
                        is_receptor = True

                # Print progress period:
                dp.dot()

                if not is_receptor:
                    # Set the compound code:
                    if uniquefield == 'NONE':
                        compound_id = str(in_st_num)
                    else:
                        try:
                            # Ev:96648 & Ev: 104439
                            compound_id = pipeutils.read_unique_field(
                                st, uniquefield)
                        except KeyError:
                            msg = "ERROR: File %s; ligand %i is missing property: %s" % (
                                os.path.basename(infile), st_file_pos,
                                uniquefield)
                            if self['SKIP_NOUNIQUE_LIGANDS']:
                                self.warning(msg)
                                continue  # Go to next input ligand
                            else:
                                self.exit(msg)

                    # Figure out if it's a new compound or not, and if it's a
                    # new variant or not:
                    if compound_id != current_compound_id or current_compound_id is None:
                        # New compound:
                        current_compound_id_variant = 1
                        number_of_compounds += 1
                        current_compound_id = compound_id
                    else:
                        # Another variant of the same compound:
                        current_compound_id_variant += 1

                    if gencodes:
                        if self['OUTCOMPOUNDFIELD'] == 's_m_title':
                            if iformat not in [SMILES, SMILESCSV]:
                                st.property[
                                    "s_pipeline_title_backup"] = st.title
                            # Will work for SmilesStructures also
                            st.title = compound_id
                        else:
                            # FIXME will not work for SmilesStructures:
                            st.property[self['OUTCOMPOUNDFIELD']] = compound_id

                        # Set the variant code:
                        s = "%s-%i" % (compound_id, current_compound_id_variant)
                        st.property[variantfield] = s

                # Figure out which file to write to:
                if is_receptor:
                    if filenum == 0:
                        filenum = 1
                    else:
                        pass  # write to the same file

                # If the compound_id (compound code) for this ligand is the same as
                # for the previous ligand, write them to the same subjob file.
                elif current_compound_id_variant == 1:
                    # This compound is different from a previous compound.
                    # (will also happen for the first structure)
                    # Consider writing it to a different file.

                    if filenum == 0:
                        filenum = 1
                    elif preserve_njobs:
                        # "preserve"
                        # Requested to keep number of subjobs the same:
                        # Put this structure into the same file as it came
                        # from:
                        filenum = infilenum

                    elif numout_njobs:
                        # "njobs"
                        if self.outfile_structures[filenum] >= n_st_per_file:
                            filenum += 1

                    else:
                        # "fewest"
                        # Requested to keep number of output files to a
                        # minimum:
                        if filenum == 0:
                            filenum = 1
                        elif self.outfile_structures[filenum] >= max_file_size:
                            filenum += 1

                    # Record number of compounds that is in this output file:
                    try:
                        self.outfile_compounds[filenum] += 1
                    except KeyError:
                        self.outfile_compounds[filenum] = 1

                if filenum != prev_filenum:
                    prev_filenum = filenum
                    if iformat == SMILES:
                        outfile = self.genFileName(filenum=filenum,
                                                   extension="-in.smi")
                    elif iformat == SMILESCSV:
                        outfile = self.genFileName(filenum=filenum,
                                                   extension="-in.csv")
                    else:
                        outfile = self.genFileName(filenum=filenum,
                                                   extension="-in.maegz")
                    if curr_writer:
                        # Close the previous file handle VSW-891
                        curr_writer.close()
                    curr_writer = structure.StructureWriter(outfile)

                    # Sanity check:
                    assert outfile not in output_files

                    output_files.append(outfile)
                    self.outfile_structures[filenum] = 0

                # Write the output structure to file:
                curr_writer.append(st)
                self.outfile_structures[filenum] += 1
                output_structure_count += 1

            if in_st_num < 1:
                self.exit(
                    "\nInput file did not contain any readable structures: " +
                    infile)

        if output_structure_count > 0:
            # Close the last file handle:
            curr_writer.close()

        output_compound_count = sum(self.outfile_compounds.values())

        # Print summary:
        self.info('')  # To have a new line after the progress periods
        self.info("Number of input structures: %i" % (in_st_num))
        self.info("Number of processed structures: %i" % output_structure_count)
        self.info("Number of processed compounds: %i" % output_compound_count)
        if num_skipped_sts:
            self.warning(
                "\n%i structures were skipped because they were invalid." %
                num_skipped_sts)
        self.info("Number of output files: %i" % len(output_files))

        if True:
            # Report to the log file the numbers of structure per each subjob:
            num_list = []
            for i, numposes in self.outfile_structures.items():
                num_compounds = self.outfile_compounds[i]
                # NUM:(ROOTS, STRUCTUES)
                num_list.append("%i:(%i, %i)" % (i, num_compounds, numposes))

            msg = "\nNumber of (compounds, structures) for each output file:"
            msg += "\n%s" % ", ".join(num_list)
            self.info(msg)

        self.setOutput(1, pipeio.Structures(output_files,
                                            output_structure_count))


# Ev:78695
[docs]class RestoreTitlesStage(stage.Stage):
    """
    Stage for copying titles for each input ligand from the
    's_pipeline_title_backup' property to the 's_m_title' property.

    The stage takes one input structure file set and generates one set of
    output structure files, each containing about 100,000 structures.
    """

[docs]    def __init__(self, *args, **kwargs):
        """
        See class docstring.
        """

        specs = """
        SOURCE_FIELD = string(default="s_pipeline_title_backup")
        DESTINATION_FIELD = string(default="s_m_title")
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "structures", True)

[docs]    def operate(self):
        """
        For each structure, sets the value of the DESTINATION_FIELD to the
        the value of the SOURCE_FIELD. If no such value exists, raises
        a RuntimeError.
        """

        self.input_files = self.getInput(1).getFiles()

        self.source_field = self['SOURCE_FIELD']
        self.destination_field = self['DESTINATION_FIELD']

        self.info('Will copy values from property "%s" to property "%s"' %
                  (self.source_field, self.destination_field))

        self.info("\nProcessing ligand files...")

        dp = pipeutils.DotPrinter()

        writer = structure.MultiFileStructureWriter(self.genOutputFileName(1),
                                                    extension=".maegz")

        st_num = 0
        for ligfile in self.input_files:
            for st in pipeutils.get_reader(ligfile):
                st_num += 1

                # Print progress period:
                dp.dot()

                try:
                    value = st.property[self.source_field]
                except KeyError:
                    raise RuntimeError(
                        'Error reading property "%s" for structure %i of file "%s"'
                        + (self.source_field, st_num, ligfile))
                try:
                    st.property[self.destination_field] = value
                except:
                    raise RuntimeError(
                        'Error setting property "%s" for structure %i of file "%s"'
                        + (self.destination_field, st_num, ligfile))

                writer.append(st)

        writer.close()
        output_files = writer.getFiles()

        self.setOutput(1, pipeio.Structures(output_files, st_num))

        # Print summary:
        self.info('')  # To have a new line after the progress periods
        self.info("Number of processed structures: %i" % (st_num))
        self.info("Number of output files: %i" % len(output_files))


# EOF