Source code for schrodinger.protein.tasks.descriptors

import csv
import os
from typing import List

from schrodinger import structure
from schrodinger.application.msv import seqio
from schrodinger.models import parameters
from schrodinger.protein import alignment
from schrodinger.protein import sequence
from schrodinger.protein.properties import PropertySource
from schrodinger.tasks import jobtasks
from schrodinger.tasks import tasks
from schrodinger.utils import csv_unicode
from schrodinger.utils import scollections

try:
    from schrodinger.application.prime.packages.protein_descriptors import \
        seq_desc
except ImportError:
    seq_desc = None


[docs]class StructureDescriptorsTask(jobtasks.CmdJobTask):
    """
    Given a list of structures, generate descriptors for them using the
    `calc_protein_descriptors.py` backend.

    Example usage::

        task = StructureDescriptorsTask()
        task.input.sts = list(structure.StructureReader('1cmy.pdb'))
        task.start()

    """
    DEFAULT_TASKDIR_SETTING = jobtasks.CmdJobTask.AUTO_TASKDIR
    PROP_SOURCE = PropertySource.Structure

[docs]    class Input(parameters.CompoundParam):
        seqs: List[sequence.AbstractSequence]

[docs]    class Output(jobtasks.CmdJobTask.Output):
        """
        :ivar descriptors: A dictionary mapping the (ID of the) original
            sequence to a dictionary which maps names of descriptors to their
            values
        :vartype descriptors: scollections.IdDict
        """
        descriptors: scollections.IdDict

    @jobtasks.preprocessor(order=tasks.AFTER_TASKDIR)
    def _writeInputStructures(self):
        self._seq_map = dict()

        os.mkdir(self._getInputDirectory())
        fname = os.path.join(self._getInputDirectory(), 'sts.mae')
        with structure.StructureWriter(fname) as st_writer:
            for idx, seq in enumerate(self.input.seqs):
                st = seq.getStructure()
                st.title = f"{st.title}_{idx}"
                self._seq_map[st.title] = seq
                st_writer.append(st)

    @jobtasks.preprocessor(order=tasks.AFTER_TASKDIR)
    def _makeOutputDirectory(self):
        os.mkdir(self._getOutputDirectory())

    def _getInputDirectory(self):
        return os.path.join(self.getTaskDir(), 'inp_sts')

    def _getOutputDirectory(self):
        return os.path.join(self.getTaskDir(), 'outp_sts')

[docs]    def makeCmd(self):
        # Output directory must be specified as local path:
        out_dir = os.path.basename(self._getOutputDirectory())
        return [
            '$SCHRODINGER/run',
            '-FROM',
            'psp',
            'calc_protein_descriptors.py',
            '-i',
            self._getInputDirectory(),
            '-o',
            out_dir,
        ]

    @tasks.postprocessor()
    def _processOutput(self):
        """
        Process the csv output from the descriptors backend.

        Each row corresponds to an input sequence and each column is a
        descriptor
        """
        out_csv = os.path.join(self._getOutputDirectory(),
                               "protein_descriptors.csv")
        with csv_unicode.reader_open(out_csv) as f:
            csv_reader = csv.reader(f)
            offset = 1  # the first column is the structure title
            property_names = next(csv_reader)[offset:]
            for row in csv_reader:
                title = row[0]
                vals = row[offset:]
                seq = self._seq_map.get(title)
                if seq:
                    prop_dict = dict(zip(property_names, vals))
                    self.output.descriptors[seq] = prop_dict


[docs]class SequenceDescriptorsTask(jobtasks.CmdJobTask):
    """
    Given a list of sequences, generate descriptors for them using the
    `calc_sequence_descriptors.py` backend.

    Example usage::

        task = SequenceDescriptorsTask()
        task.input.sts = list(alignment.ProteinAlignment(['VVVV', 'AAA', 'TT']))
        task.start()
    """
    DEFAULT_TASKDIR_SETTING = jobtasks.CmdJobTask.TEMP_TASKDIR
    PROP_SOURCE = PropertySource.Sequence

[docs]    class Input(parameters.CompoundParam):
        """
        :param seqs: Sequence to calculate descriptors for.

        :param specified_descriptors: What descriptors to calculate for `seqs`.
            If empty, all possible descriptors will be calculated.
        """
        seqs: List[sequence.AbstractSequence]
        requested_descriptors: List[str]

[docs]    class Output(jobtasks.CmdJobTask.Output):
        """
        :ivar descriptors: A dictionary mapping the (ID of the) original
            sequence to a dictionary which maps names of descriptors to their
            values
        :vartype descriptors: scollections.IdDict
        """
        descriptors: scollections.IdDict

    @jobtasks.preprocessor(order=tasks.BEFORE_TASKDIR)
    def _checkPSPInstalled(self):
        if seq_desc is None:
            raise RuntimeError("Prime must be installed to run a "
                               "SequenceDescriptorsTask.")

    @jobtasks.preprocessor(order=tasks.AFTER_TASKDIR)
    def _createSpecifiedDescriptorsFile(self):
        if self.input.requested_descriptors:
            all_descriptors = set(seq_desc.get_all_descriptor_names())
            for desc in self.input.requested_descriptors:
                if desc.upper() not in all_descriptors:
                    raise ValueError(f"{desc} is not a valid descriptor name.")
            with open(self.getTaskFilename('requested_descriptors.txt'),
                      'w') as inp_file:
                inp_file.write('\n'.join(self.input.requested_descriptors))

    @jobtasks.preprocessor(order=tasks.AFTER_TASKDIR)
    def _writeInputSequences(self):
        aln = alignment.ProteinAlignment(self.input.seqs)

        # Write alignment as fasta and get the sequence names the writer used to
        # write the file
        os.mkdir(self._getInputDirectory())
        fname = os.path.join(self._getInputDirectory(), 'inp.fasta')
        names = seqio.FastaAlignmentWriter.write(aln, fname)
        # Backend changes commas to underscores, presumably because the
        # output is a csv
        names = [name.replace(',', '_') for name in names]

        # Create a mapping between the names and the original sequence
        self._seq_map = dict(zip(names, self.input.seqs))

    def _getInputDirectory(self):
        return os.path.join(self.getTaskDir(), 'inp_seqs')

[docs]    def makeCmd(self):
        cmd = [
            '$SCHRODINGER/run', '-FROM', 'psp', 'calc_sequence_descriptors.py',
            '-i',
            self._getInputDirectory()
        ]

        if self.input.requested_descriptors:
            cmd.extend(
                ['-d', self.getTaskFilename('requested_descriptors.txt')])
        return cmd

    @tasks.postprocessor()
    def _processOutput(self):
        """
        Process the csv output from the descriptors backend.

        Each row corresponds to an input sequence and each column is a
        descriptor
        """
        with csv_unicode.reader_open(
                self.getTaskFilename("sequence_descriptors.csv")) as f:
            csv_reader = csv.reader(f)
            offset = 2  # The first two columns is the index and fasta header
            property_names = next(csv_reader)[offset:]
            for row in csv_reader:
                seq_header = row[1]
                long_name, *_ = seqio.parse_in_house_header(seq_header)
                vals = row[offset:]
                seq = self._seq_map.get(long_name)
                if seq:
                    prop_dict = dict(zip(property_names, vals))
                    self.output.descriptors[seq] = prop_dict

        if not seq_desc:
            print("Prime is not installed. Not computing residue descriptors")
            return
        for seq in self._seq_map.values():
            three_letter_seq = [res.long_code for res in seq if not res.is_gap]
            res_descriptors = seq_desc.calc_protscale_descriptors(
                three_letter_seq)
            for idx, res in enumerate(seq.residues()):
                d_dict = {}
                for descriptor_name, profile in res_descriptors.items():
                    seq_desc_name = seq_desc.PROT_SCALES_LOOKUP.get(
                        descriptor_name, [''])[0]
                    if seq_desc_name in self.input.requested_descriptors:
                        d_dict[seq_desc_name] = profile[idx]
                res.updateDescriptors(d_dict)