Source code for schrodinger.application.bioluminate.reactive_residues

"""
Identify protein sequences that are likely:

1) Deamidation sites.

Standard rules exist for determining deamidation sites, i.e. X-Asn-Y and
X-Gln-Y are considered the hottest deamidation targets, with Y=Gly, Ala, Ser,
and with the rate determined to some extent by the nature of X (higher for X =
polar).  There seems to be some correlation with flexibility at the region, as
well.

2) Oxidation sites.

Identification of potential oxidation sites would start with highlighting the
His, Met, Cys, Trp and Tyr residues.

3) Glycosylation sites (most common).

N-linked: Asn. Consensus sequence: Asn-X-Ser/Thr (X=! Pro)
O-linked: Ser and Thr. No consensus sequence. We do not identify these.
At greater detail, there is some indication that Asn-X-Ser/Thr-Y can be
considered and the tendency toward glycosylation is impacted by Y. (See, e.g.
Mellquist et al Biochemistry (1998) 12, 6833-7).

4) Proteolysis hot spots.

These would be Asp residues. The cleavage can occur at either the N or
C-terminal end.


Copyright (c) Schrodinger, LLC. All rights reserved
"""

from collections import defaultdict
from past.utils import old_div

from schrodinger.protein import sequence
from schrodinger.structutils import analyze
from schrodinger.structutils import transform
from schrodinger.ui.sequencealignment import find_gui
from schrodinger.ui.sequencealignment.structure_utils import calculate_sasa_dict

STD_SASA_DICT = None


[docs]class ReactiveResidues:
    """
    Find the reactive residues on a protein based on sequence patterns.
    """

[docs]    def __init__(self,
                 st,
                 search_pre_defined=True,
                 custom_patterns=[]):  # noqa: M511
        """
        :type st: _Structure object
        :params st: input structure

        :type search_pre_defined: boolean
        :params search_pre_defined: whether or not search the pre-defined patterns for reactive sites

        :type custom_patterns: a list of tuple (name, pattern_str, color)
        :params custom_patterns: input custom patterns defining the reactive sites
        """
        self.st = st
        self.search_pre_defined = search_pre_defined
        self.custom_patterns = custom_patterns

        # store the sequences
        self.sequences = list(sequence.get_structure_sequences(self.st))

        # Calculate the atomic SASAs. Exclude the waters, as they affect the SASA calculation:
        self.sasa_by_atom = analyze.calculate_sasa_by_atom(self.st,
                                                           exclude_water=True)

[docs]    def generateCSVTable(self):
        """
        Analyze the given structure and return the results summary in a CSV table
        """
        csv_table = []
        csv_table.append(['Residues', 'Type', 'SASA', 'Exposure', 'B Factor'])
        for res in self.iterateResidues():
            if res:
                if res.bfactor is None:
                    bfac = "NA"
                else:
                    bfac = round(res.bfactor, 2)
                if res.percent_exposure is None:
                    exp = "NA"
                else:
                    exp = str(res.percent_exposure) + '%'
                csv_table.append(
                    [res.res_str,
                     str(res.type),
                     round(res.sasa, 2), exp, bfac])
        return csv_table

[docs]    def iterateResidues(self):
        """
        Iterate over all residues and yield ResidueData objects (or None if the residue is not reactive)
        Note that one residue can be multiple reactive types and yielded multiple times
        """
        for seq in self.sequences:
            # Calculate everything that is necessary for doing pattern-matching
            # for this sequence:
            seq_dict = sequence.convert_structure_sequence_for_pattern_search(
                seq, self.sasa_by_atom)

            # Check residues that match to custom patterns
            matched_residues = defaultdict(list)
            for name, pattern, color in self.custom_patterns:
                found_patterns = sequence.find_generalized_pattern(
                    [seq_dict], pattern, validate_pattern=False)
                if found_patterns is False:
                    # Should never happen, as the patterns are validated earlier
                    raise ValueError('Invalid pattern specified: "%s"' %
                                     pattern)
                else:
                    for res_index, ignored in found_patterns[0]:
                        # note that the hotspot position is in the pattern string
                        matched_residues[res_index].append((name, color))
                        # one residue can match multiple patterns

                    # Iterate over all residues, and yield ResidueData objects:
            residue_list = list(seq.residue)
            for i, curr_res in enumerate(residue_list):
                if i == 0 or i == (len(residue_list) - 1):
                    # Can't analyze first or last residue
                    continue
                try:
                    prev_res = residue_list[i - 1]
                except IndexError:
                    continue
                try:
                    next_res = residue_list[i + 1]
                except IndexError:
                    continue

                # Check whether it is reactive based on custom criteria:
                custom_res_types = matched_residues.get(i, [])

                # Check whether it is reactive based on built-in criteria:
                if self.search_pre_defined:
                    # FIXME pull this code out into a method:
                    pre_resname = prev_res.pdbres.upper().strip()
                    curr_resname = curr_res.pdbres.upper().strip()
                    next_resname = next_res.pdbres.upper().strip()
                    pre_defined_res_types = self._checkGroupForTypes(
                        pre_resname, curr_resname, next_resname)
                else:
                    pre_defined_res_types = []

                if not custom_res_types and not pre_defined_res_types:
                    yield None
                    continue

                # If got here, this residue is reactive
                try:
                    prev_res_acarbon = prev_res.getAlphaCarbon().index
                    curr_res_acarbon = curr_res.getAlphaCarbon().index
                    next_res_acarbon = next_res.getAlphaCarbon().index
                except AttributeError as err:
                    # Ev:117846 At least one of these residue had no alpha-carbon
                    print("WARNING: %s" % err)
                    yield None
                    continue

                for res_type in pre_defined_res_types:
                    custom_color = None
                    if res_type == find_gui.GLYCOSYLATION_N_PATTERN:
                        # reactive residue is the first
                        res = prev_res
                        alpha_carbon = prev_res_acarbon
                        neighbor_alphas = [curr_res_acarbon, next_res_acarbon]
                    else:
                        res = curr_res
                        alpha_carbon = curr_res_acarbon
                        neighbor_alphas = [prev_res_acarbon, next_res_acarbon]
                    yield ResidueData(res, res_type, alpha_carbon,
                                      neighbor_alphas, custom_color,
                                      self.sasa_by_atom)

                for pattern, custom_color in custom_res_types:
                    res_type = "Custom (%s)" % pattern
                    # Converting from 0-255 ints to 0-1 floats:
                    custom_color = [old_div(x, 255.0) for x in custom_color]
                    res = curr_res
                    alpha_carbon = curr_res_acarbon
                    neighbor_alphas = [prev_res_acarbon, next_res_acarbon]
                    yield ResidueData(res, res_type, alpha_carbon,
                                      neighbor_alphas, custom_color,
                                      self.sasa_by_atom)

    def _checkGroupForTypes(self, pre_resname, curr_resname, next_resname):
        """
        Return a list of reactive residue types for residue "curr_resname" ("pre_resname" for Glycosylation).
        if not matched, return an empty list
        """

        types = []
        if curr_resname in ["ASN", "GLN"]:
            if next_resname in ["GLY", "ALA", "SER"]:
                # the other residue is X; rate is higher for more polar X
                types.append(find_gui.DEAMIDATION_PATTERN)

        # Oxydation
        # Identification of potential oxidation sites would start with highlighting
        # the His, Met, Cys, Trp and Tyr residues.

        if curr_resname in ["HIS", "MET", "CYS", "TRP", "TYR"]:
            types.append(find_gui.OXIDATION_PATTERN)

        # Glycosylation, N-linked: Asn-X-Ser/Thr (X != Pro), single direction
        # from N terminal to C terminal
        if (pre_resname == "ASN" and next_resname in ["SER", "THR"]):
            if curr_resname != "PRO":
                # curr_resname is X
                types.append(find_gui.GLYCOSYLATION_N_PATTERN)

        # At greater detail, there is some indication that Asn-X-Ser/Thr-Y can be
        # considered and the tendency toward glycosylation is impacted by Y.

        # Porteolysis, Asp residues. The cleavage can occur at either the N or C terminal end.
        if curr_resname == "ASP":
            types.append(find_gui.PROTEOLYSIS_PATTERN)

        return types

[docs]    def getNumResidues(self):
        """
        return the total number of residues in the sequences that will be analyzed.
        """
        num_residues = 0
        for seq in self.sequences:
            num_residues += len(seq.residue)
        return num_residues


[docs]class ResidueData:
    """
    A container holding the data about the reactive residue
    """

[docs]    def __init__(self, res, res_type, alpha_carbon, neighbor_alphas,
                 custom_color, sasa_by_atom):
        """
        :type res: structure._Residue object
        :params res: the reactive residue

        :type res_type: str
        :params res_type: the name for the type of the reactive residue

        :type alpha_carbon: int
        :params alpha_carbon: Ca atom index of the reactive residue

        :type neighor_alphas: a list of 2 integers
        :params neighbor_alphas: Ca atom indices of previous and next residue to the reactive residue

        :type custom_color: tuple of 3 integers, i.e. rgb color
        :params custom_color: color for custom pattern

        :type sasa_by_atom: list
        :params sasa_by_atom: SASA of all atoms of the given structure

        """
        self.residue = res  # _Residue object
        self.res_str = str(res) + " %s" % res.pdbres.strip()
        self.type = res_type
        self.acarbon = alpha_carbon
        self.neighbor_alphas = neighbor_alphas
        self.custom_color = custom_color
        self.reactive_atoms = res.getAtomIndices()

        # Figure out where the center of the side chain is:
        res_atoms = res.getAtomIndices()
        res_atoms_asl = analyze.generate_asl(res.structure, res_atoms)
        side_chain_atoms = analyze.evaluate_asl(
            res._st, res_atoms_asl + " AND sidechain")

        # Ev:117851 If a residue has no side chain, use all atoms to determine the center:
        if not side_chain_atoms:
            side_chain_atoms = res_atoms

        self.side_chain_center = transform.get_centroid(res.structure,
                                                        side_chain_atoms)[:3]

        # Calculate the SASA (don't do it for all residues at once, as then
        # we would not get a progress dialog bar):
        # self.sasa = analyze.calculate_sasa(res2._st, side_chain_atoms)
        self.sasa = sum(
            [sasa_by_atom[atomnum - 1] for atomnum in side_chain_atoms])

        global STD_SASA_DICT
        if STD_SASA_DICT is None:
            STD_SASA_DICT = calculate_sasa_dict()

        resname = res.pdbres.upper().strip()
        try:
            std_sasa = STD_SASA_DICT[resname]
        except KeyError:
            print("WARNING: No standard SASA found for residue: %s" % resname)
            self.percent_exposure = None
        else:
            sasa_exposure_ratio = old_div(self.sasa, std_sasa)
            self.percent_exposure = int(sasa_exposure_ratio * 100.0)
            if self.percent_exposure > 100:
                # Should never happen, but if it would, the user would be confused
                self.percent_exposure = 100

        b_factor = res.temperature_factor
        self.bfactor = b_factor if b_factor > 0 else None