
# python module
"""
Stages for running MacroModel jobs.

    ConfSearchStage - stage for running MacroModel conf-search calculations.
    SampleRingsStage - stage for sampling large rings

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Matvey Adzhigirey, Shawn Watts

##########################################################################
# Packages
##########################################################################

import os
import sys

import schrodinger.application.macromodel.input as mmod_input
import schrodinger.application.macromodel.tools as mt
import schrodinger.structure as structure
from schrodinger.pipeline import pipeio
from schrodinger.pipeline import pipeutils
from schrodinger.pipeline import stage

MACROMODEL = os.path.join(os.getenv('SCHRODINGER'), 'macromodel')

##########################################################################
# Classes
##########################################################################


class ConfSearchStage(stage.Stage):
    """
    Stage for running MacroModel calculations.
    """

    def __init__(self, *args, **kwargs):
        specs = mmod_input.GENERAL_SPECS + \
            mmod_input.MINIMIZER_SPECS + mmod_input.CONFSEARCH_SPECS
        specs += ["SERIAL_SPLIT_OUTPUT = string(default=False)"]
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        # Used by Pipeline to associate -host_mmod with this stage:
        self.setMainProduct('macromodel')
        self.requiredProduct('macromodel')

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "structures", True)

        self.jobdj = None
        self.status = "NOT STARTED"
        return

    def recombineInputLigands(self):
        """
        Read all input structure files, and write them to separate subjob
        input files. The number of subjobs is determined by the Pipeline
        and adjusted so that the subjob size is within a good limit, if
        requested by the user.
        """

        input_files = self.getInput(1).getFiles()

        self.info(" Counting number of input structures")
        (self.input_variants_count, self.input_compound_count) = \
            pipeutils.countRoots(input_files)

        msg = "\n Number of input structures: " + \
            str(self.input_variants_count)
        self.info(msg)
        msg = " Number of input compounds: " + str(self.input_compound_count)
        self.info(msg)

        # ConfSearchStage
        # Determine the number of subjobs to create:
        # NOTE: This min/max range is used only when the Pipeline driver
        # requests the number of jobs to be adjusted to yield optimal sizes
        min_job_size = 50  # About 1 hour
        max_job_size = 500  # About 10 hours

        (n_st_per_file, njobs, adjusted) = \
            self.getAdjustedNJobs(
                self.input_variants_count, min_job_size, max_job_size)

        if adjusted:
            self.info(
                " Adjusting number of jobs to yield %i-%i ligands per job" %
                (min_job_size, max_job_size))
        self.info(
            " Separating ligands into %i subjobs (%i max ligands per job)." %
            (njobs, n_st_per_file))

        writer = structure.MultiFileStructureWriter(
            self.genFileName(),
            extension="_in.maegz",
            sts_per_file=n_st_per_file)
        st_num = 0
        for ligfile in input_files:
            for st in pipeutils.get_reader(ligfile):
                st_num += 1
                if st_num % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                writer.append(st)
        writer.close()  # close last writer

        output_files = writer.getFiles()

        self.checkFiles(output_files)

        self.bmin_input_files = output_files
        return
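
    # The splitting pattern above, shown in isolation as a minimal sketch.
    # The base name and input file name here are hypothetical, and
    # sts_per_file would normally come from getAdjustedNJobs():
    #
    #   writer = structure.MultiFileStructureWriter("subjob",
    #                                               extension="_in.maegz",
    #                                               sts_per_file=100)
    #   for st in pipeutils.get_reader("ligands.maegz"):
    #       writer.append(st)
    #   writer.close()
    #   subjob_files = writer.getFiles()  # one file per <= 100 structures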

    def setupJobs(self):
        """
        Build up the command used to launch each subjob, and add that
        command to JobDJ.
        """
        self.jobdj = self.getJobDJ()

        self.expected_jobdj_outputs = []

        filenum = 0
        for ligfile in self.bmin_input_files:
            filenum += 1
            jobname = self.genFileName(filenum=filenum, extension="")
            inpfile = jobname + '.inp'
            outfile = jobname + '-out.maegz'

            self.writeSIFFile(ligfile, inpfile, outfile)

            cmd = [MACROMODEL, inpfile]
            self.jobdj.addJob(cmd)

            self.expected_jobdj_outputs.append(outfile)
        return
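
    # Each subjob runs "$SCHRODINGER/macromodel <jobname>.inp", where the
    # .inp file is the SIF written by writeSIFFile(). A minimal sketch of
    # one iteration of the loop above (the file name is hypothetical; the
    # jobs are actually executed later by runJobDJ() in operate()):
    #
    #   jobdj = self.getJobDJ()
    #   jobdj.addJob([MACROMODEL, "subjob-001.inp"])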

    def processJobOutputs(self):
        """
        After the subjobs are complete, their output files are combined
        into a few larger output files.
        """

        outligands = self.expected_jobdj_outputs

        self.info(" BMIN output files: " + str(outligands))
        self.info(" Combining the BMIN output files")

        st_num = 0
        writer = structure.MultiFileStructureWriter(self.genFileName(),
                                                    extension=".maegz")
        for ligfile in outligands:
            for st in pipeutils.get_reader(ligfile):
                st_num += 1
                if st_num % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                writer.append(st)
        writer.close()  # close the last open file

        # Print return after periods:
        self.info("")

        output_files = writer.getFiles()

        self.setOutput(1, pipeio.Structures(output_files, st_num))
        return

    def splitJobOutputs(self):
        """
        Split each BMIN output file by serial number (mt.serial_split) and
        set the resulting files as the output of this stage. Used when
        SERIAL_SPLIT_OUTPUT is True.
        """

        csearch_out = self.expected_jobdj_outputs

        self.info(" BMIN output files: " + str(csearch_out))
        self.info(" Splitting the BMIN output files")

        split_out = []
        # Split by serial number:
        for outfile in csearch_out:
            split_out.extend(mt.serial_split(outfile))

        self.setOutput(1, pipeio.Structures(split_out))
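
    # A minimal sketch of the serial split used above (hypothetical file
    # name; the naming of the split files is determined by mt.serial_split,
    # which returns the paths of the files it writes):
    #
    #   split_files = mt.serial_split("subjob-001-out.maegz")
    #   for fname in split_files:
    #       print(fname)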

    def operate(self):
        """
        Run the MacroModel conformational search on the input files.
        """

        if self.status in ["NOT STARTED", "PREPARING INPUT FILES"]:
            self.status = "PREPARING INPUT FILES"
            self.recombineInputLigands()
            self.status = "SETTING UP JOBS"
            self.dump()

        if self.status == "SETTING UP JOBS":
            self.setupJobs()
            self.status = "RUNNING JOBS"
            self.dump()

        if self.status == "RUNNING JOBS":
            # Update JobDJ to correct options (may change if restarting):
            self.setJobDJOptions(self.jobdj)
            self.runJobDJ(self.jobdj)
            self.status = "PROCESSING FILES"
            self.dump()

        if self.status == "PROCESSING FILES":
            if self['SERIAL_SPLIT_OUTPUT'] == 'True':
                self.splitJobOutputs()
            else:
                self.processJobOutputs()
            self.status = "COMPLETE"
            return

        # COMPLETE
        return

    def writeSIFFile(self, ligfile, inpfile, outfile):
        """
        Write the MacroModel SIF input file <inpfile> for the subjob that
        reads <ligfile> and writes <outfile>.
        """

        kw_dict = {}
        for key in self.keys():
            if key not in ['SERIAL_SPLIT_OUTPUT']:
                kw_dict[key] = self[key]
        kw_dict['INPUT_STRUCTURE_FILE'] = ligfile
        kw_dict['OUTPUT_STRUCTURE_FILE'] = outfile

        fh = open(inpfile, 'w')
        fh.write("# Input file for $SCHRODINGER/macromodel\n")
        config = mmod_input.ConfSearch(kw_dict)
        config.writeInputFile(fh)
        fh.close()
        # print 'Input file written:', inpfile
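
# A standalone sketch of the SIF generation done by writeSIFFile() above.
# File names are hypothetical; only keywords already used in this module
# are shown:
#
#   kw_dict = {
#       'INPUT_STRUCTURE_FILE': 'ligands_in.maegz',
#       'OUTPUT_STRUCTURE_FILE': 'ligands-out.maegz',
#       'FORCE_FIELD': 'OPLS_2005',
#   }
#   with open('confsearch.inp', 'w') as fh:
#       fh.write("# Input file for $SCHRODINGER/macromodel\n")
#       mmod_input.ConfSearch(kw_dict).writeInputFile(fh)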


class SampleRingsStage(stage.Stage):
    """
    Stage for sampling rings with 7 or more members using MacroModel.
    """

    def __init__(self, *args, **kwargs):
        # FIXME allow more MacroModel keywords
        # Like this:
        # specs = mmod_input.GENERAL_SPECS + mmod_input.MINIMIZER_SPECS + mmod_input.CONFSEARCH_SPECS
        # specs += [
        #     "SERIAL_SPLIT_OUTPUT = string(default=False)"
        # ]
        # stage.Stage.__init__(self, specs=specs, *args, **kwargs)
        specs = """
            FORCE_FIELD = string(default="OPLS_2005") # force field to use
            SOLVENT = string(default="Water") # Solvent model
            ELECTROSTATIC_TREATMENT = string(default="Constant dielectric")
            CHARGES_FROM = string(default="Force field")
            CUTOFF = string(default="Extended")
            OUTCONFS_PER_SEARCH = integer(default=1) # 1 output per input structure
            MAXIMUM_ITERATION = integer(default=500)
            CONVERGE_ON = string(default="Gradient")
            CONFSEARCH_STEPS_PER_ROTATABLE = integer(default=10)
            CONVERGENCE_THRESHOLD = float(default=0.05)
            CONFSEARCH_STEPS = integer(default=100)
        """
        stage.Stage.__init__(self, specs=specs, *args, **kwargs)

        self.setMainProduct('macromodel')
        self.requiredProduct('macromodel')

        self.addExpectedInput(1, "structures", True)
        self.addExpectedOutput(1, "structures", True)

        self._job_outputs = []
        self.samplerings_input_files = None
        self.jobdj = None
        self.status = "NOT STARTED"
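
    # Keywords declared in the specs above are exposed through the stage's
    # dict-like interface (see self[key] usage elsewhere in this module).
    # A sketch, assuming the usual Stage constructor signature with a stage
    # name as the first positional argument; the name is hypothetical:
    #
    #   stg = SampleRingsStage("SAMPLE_RINGS")
    #   stg['FORCE_FIELD'] = "OPLS_2005"
    #   stg['CONFSEARCH_STEPS'] = 200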

    def prepareInputLigands(self):
        """
        Split the input ligands into structures whose rings are already
        acceptable and structures that need ring sampling, then write the
        latter to the subjob input files.

        Return True if any ligands need ring sampling, False otherwise.
        """

        input_object = self.getInput(1)
        input_files = input_object.getFiles()

        self.through_ligands_file = \
            self.genFileName(extension='_through_ligands.maegz')
        self.presample_ligands_file = \
            self.genFileName(extension='_presample_ligands.maegz')

        # Split up the files into 2 files: one for ligands that ringconf did
        # well on, and another for problematic ligands.
        self.presample_num = 0
        self.through_num = 0
        through_writer = structure.StructureWriter(self.through_ligands_file)
        presample_writer = structure.StructureWriter(
            self.presample_ligands_file)

        st_num = 0
        for ligfile in input_files:
            for st in pipeutils.get_reader(ligfile):
                st_num += 1
                if st_num % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                result = st.property.get('i_lp_ring_sampling_problem')
                if result is None:
                    # Property does not exist:
                    through_writer.append(st)
                    self.through_num += 1
                else:
                    # Property exists. Will be 2 for a partial match and 1
                    # for a full mismatch.
                    presample_writer.append(st)
                    self.presample_num += 1
        through_writer.close()
        presample_writer.close()

        if self.presample_num == 0:
            self.info(
                "No ligands need ring sampling. Returning a copy of the "
                "input structures object.")
            self.setOutput(
                1,
                pipeio.Structures([self.through_ligands_file],
                                  self.through_num))
            return False  # No need to sample
        else:
            msg = "\nringconf Problem ligands: %s" % self.presample_ligands_file
            self.info(msg)

            if not self.through_num:
                self.through_ligands_file = None
            msg = "\nringconf Non-problem ligands: %s" % self.through_ligands_file
            self.info(msg)

        # SampleRingsStage
        # Determine the number of subjobs to create:
        # NOTE: This min/max range is used only when the Pipeline driver
        # requests the number of jobs to be adjusted to yield optimal sizes
        #
        # NOTE: These job sizes have NOT been tested. How long the jobs take
        # largely depends on how many of the input structures need to be
        # processed.
        min_job_size = 100
        max_job_size = 5000

        (n_st_per_file, njobs, adjusted) = self.getAdjustedNJobs(
            self.presample_num, min_job_size, max_job_size)

        if adjusted:
            self.info(
                " Adjusting number of jobs to yield %i-%i ligands per job" %
                (min_job_size, max_job_size))
        self.info(
            " Separating ligands into %i subjobs (%i max ligands per job)." %
            (njobs, n_st_per_file))

        #
        # Split up the ligands into the appropriate number of subjobs:
        #
        writer = structure.MultiFileStructureWriter(
            self.genFileName(),
            extension="_in.maegz",
            sts_per_file=n_st_per_file)
        st_num = 0
        for st in pipeutils.get_reader(self.presample_ligands_file):
            st_num += 1
            # Print progress period:
            if st_num % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            writer.append(st)
        writer.close()  # close the last open file

        self.samplerings_input_files = writer.getFiles()

        return True  # Need to sample rings
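
    # The filter above keys off the "i_lp_ring_sampling_problem" property
    # set by the upstream ringconf step. A minimal sketch of the same test
    # (the input file name is hypothetical):
    #
    #   for st in pipeutils.get_reader("ligprep_out.maegz"):
    #       flag = st.property.get('i_lp_ring_sampling_problem')
    #       if flag is None:
    #           pass  # rings already handled; pass the structure through
    #       else:
    #           pass  # 2 = partial match, 1 = full mismatch; resample rings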

    def setupJobs(self):
        """
        Build up the MacroModel command for each subjob, and add it to
        JobDJ.
        """
        self.jobdj = self.getJobDJ()

        self.expected_jobdj_outputs = []

        filenum = 0
        for ligfile in self.samplerings_input_files:
            filenum += 1
            jobname = self.genFileName(filenum=filenum, extension="")
            inpfile = jobname + '.inp'
            outfile = jobname + '-out.maegz'

            self.writeSIFFile(ligfile, inpfile, outfile)

            cmd = [MACROMODEL, inpfile]
            self.jobdj.addJob(cmd)

            self.expected_jobdj_outputs.append(outfile)

    def processJobOutputs(self):
        """
        Run after all subjobs are complete to recombine the output ligands.
        """

        # FIXME: Get rid of expected_jobdj_outputs and use jobnames instead.
        self.checkFiles(self.expected_jobdj_outputs)

        # FIXME: Use outname = self.getOutputName(1)

        self.info(" BMIN output files: %s" % self.expected_jobdj_outputs)
        self.info(" Combining the BMIN output files")

        #
        # Combine the through ligands and the bmin output ligands:
        #
        if self.through_num:
            # Some ligands did not need ring sampling:
            outligand_files = [self.through_ligands_file] + \
                self.expected_jobdj_outputs
        else:
            # All ligands needed ring sampling (Ev:56857):
            outligand_files = self.expected_jobdj_outputs

        writer = structure.MultiFileStructureWriter(self.genFileName(),
                                                    extension=".maegz")
        st_num = 0  # count structures across all files (used as output count)
        for ligfile in outligand_files:
            for st in pipeutils.get_reader(ligfile):
                # Print progress period:
                st_num += 1
                if st_num % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                writer.append(st)
        writer.close()  # close the last open file

        # Print return after periods:
        self.info("")

        output_files = writer.getFiles()

        self.setOutput(1, pipeio.Structures(output_files, st_num))

    def operate(self):
        """
        Perform an operation on the input files.
        """

        #enter_status = self.status

        if self.status in ["NOT STARTED", "PREPARING INPUT FILES"]:
            self.status = "PREPARING INPUT FILES"
            need_sampling = self.prepareInputLigands()
            if need_sampling:
                self.status = "SETTING UP JOBS"
            else:
                self.status = "COMPLETE"
            self.dump()

        if self.status == "SETTING UP JOBS":
            self.setupJobs()
            self.status = "RUNNING JOBS"
            self.dump()

        if self.status == "RUNNING JOBS":
            # Update JobDJ to correct options (may change if restarting):
            self.setJobDJOptions(self.jobdj)
            self.runJobDJ(self.jobdj)
            self.status = "PROCESSING FILES"
            self.dump()

        if self.status == "PROCESSING FILES":
            self.processJobOutputs()
            self.status = "COMPLETE"
            self.dump()

        # COMPLETE
        return

    def writeSIFFile(self, ligfile, inpfile, outfile):
        """
        Write the MacroModel SIF input file <inpfile> for the subjob that
        reads <ligfile> and writes <outfile>.
        """

        kw_dict = {}
        # Copy all stage keywords (no exclusions, unlike ConfSearchStage):
        for key in self.keys():
            kw_dict[key] = self[key]
        kw_dict['INPUT_STRUCTURE_FILE'] = ligfile
        kw_dict['OUTPUT_STRUCTURE_FILE'] = outfile

        # MCMM may be ineffective for constrained systems and low-mode
        # sampling seems to perform better; a mix of the two seems to be
        # the best balance for sampling rings in ligand-like molecules:
        kw_dict['CONFSEARCH_METHOD'] = "Mixed"
        kw_dict['MULTI_LIGAND'] = True
        kw_dict['JOB_TYPE'] = 'CONFSEARCH'

        fh = open(inpfile, 'w')
        fh.write("# Input file for $SCHRODINGER/macromodel\n")
        config = mmod_input.ConfSearch(kw_dict)
        config.writeInputFile(fh)
        fh.close()

        return inpfile
# EOF