Source code for schrodinger.protein.annotation

# -*- coding: utf-8 -*-
"""
Annotations for biological sequences

Copyright Schrodinger, LLC. All rights reserved.
"""

import collections
import enum
import itertools
import math
import re
import warnings
from enum import Enum
from functools import cached_property

from Bio import SeqIO

from schrodinger.application.msv import utils as msv_utils
from schrodinger.application.msv.utils import get_rolling_average
from schrodinger.infra import util
from schrodinger.models import jsonable
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze

DEFAULT_WINDOW_PADDING = 4

_MODULE_NOT_YET_IMPORTED = object()
antibody = _MODULE_NOT_YET_IMPORTED
_SeqType = None
_disulfide_bond_title = "S-S bond"
_ssa_title = "SSA"


def _delayed_antibody_import():
    """
    The antibody modules imports alignment, which imports this module.  To avoid
    a circular import, we delay importing antibody until it's needed.  (See
    `_AntibodyCDRFinder.__init__`.)
    """
    global antibody
    try:
        from schrodinger.application.prime.packages import antibody
    except ImportError:
        # Prime is not installed.  We can still create _AntibodyCDRFinder
        # instances, but they won't find any CDRs.
        antibody = None
    else:
        global _SeqType

        class _SeqType(SeqTypeMixin, antibody.SeqType):
            pass


LOGO_MAX_DIVERSITY = math.log(20.0, 2.0)


class _BindingSiteDistance(jsonable.JsonableEnum):
    d3 = 3
    d4 = 4
    d5 = 5
    d6 = 6
    d7 = 7
    d8 = 8


BindingSiteDistance = util.enum_speedup(_BindingSiteDistance)
BINDING_SITE = Enum('BINDING_SITE', ['CloseContact', 'FarContact', 'NoContact'])

AntibodyCDRLabel = Enum('AntibodyCDRLabel',
                        ['NotCDR', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3'])
AntibodyCDR = collections.namedtuple('AntibodyCDR', 'label start end')
Domains = Enum('Domains', ['Domain', 'NoDomain'])

AntibodyCDRScheme = util.enum_speedup(
    jsonable.JsonableEnum(
        'AntibodyCDRScheme',
        ['Chothia', 'Kabat', 'IMGT', 'EnhancedChothia', 'AHo']))

DEFAULT_ANTIBODY_SCHEME = AntibodyCDRScheme.Kabat
ANTIBODY_CDR_ITEMS = {
        "Chothia": AntibodyCDRScheme.Chothia,
        "Enhanced Chothia": AntibodyCDRScheme.EnhancedChothia,
        "Kabat": AntibodyCDRScheme.Kabat,
        "IMGT": AntibodyCDRScheme.IMGT,
        "AHo": AntibodyCDRScheme.AHo
    } # yapf: disable


[docs]@enum.unique
class KinaseConservation(jsonable.JsonableEnum):
    VeryLow = "Very Low"
    Low = "Low"
    Medium = "Medium"
    High = "High"
    VeryHigh = "Very High"


[docs]@enum.unique
class KinaseFeatureLabel(jsonable.JsonableEnum):
    GLYCINE_RICH_LOOP = "Glycine Rich Loop"
    ALPHA_C = "Alpha-C"
    GATE_KEEPER = "Gate Keeper"
    HINGE = "Hinge"
    LINKER = "Linker"
    HRD = "HRD"
    CATALYTIC_LOOP = "Catalytic Loop"
    DFG = "DFG"
    ACTIVATION_LOOP = "Activation Loop"
    NO_FEATURE = "No Feature"


[docs]class Consensus(Enum):

    not_conserved = " "
    fully_conserved = "*"
    strongly_conserved = ":"
    weakly_conserved = "."

    @property
    def tooltip(self):
        return self.name.replace("_", " ")


class _AnnotationEnum(jsonable.JsonableEnum):
    """
    An enum class with a title for use in row headers.  The title is based on
    the enum name by default but can be overridden if desired.
    """

    def __init__(self, _=None):
        # if an Enum subclass defines an init the value of the enum will be
        # passed to it
        super(_AnnotationEnum, self).__init__()
        self._title = None
        self._tooltip = ''

    @property
    def title(self):
        if self._title is not None:  # could be a custom empty string
            return self._title
        else:
            return self.name.capitalize().replace("_", " ")

    @title.setter
    def title(self, value):
        self._title = value

    @property
    def tooltip(self):
        # Default is no tooltip. May override with custom tooltip.
        return self._tooltip

    @tooltip.setter
    def tooltip(self, value):
        self._tooltip = value


class _SequenceAnnotationEnum(_AnnotationEnum):
    """
    An enum class for annotations that apply to a single sequence (as opposed to
    global annotations that apply to the alignment as a whole).

    :ivar requires_structure: Does this annotation only apply to sequences that
        have an associated structure?
    :vartype requires_structure: bool

    :ivar multi_value: Can this annotation have multiple values?
    :vartype multi_value: bool

    :ivar can_expand: Can this annotation be used to expand selection?
        Should only be set to True for annotations with categorical values
        (e.g. enum, string, or limited int values). Annotations may need
        special handling in `getAnnotationValueForComparison` in sequence.py
    :vartype can_expand: bool
    """

    def __init__(self, _):
        super(_SequenceAnnotationEnum, self).__init__()
        self.requires_structure = False
        self.multi_value = False
        self.can_expand = False


[docs]class TupleWithRange(tuple):

    @cached_property
    def range(self):
        """
        The range of data contianed in this tuple.  Will return a tuple of
        (minimum value or zero whichever is less, maximum value or zero
        whichever is greater).  `None` values will be ignored.  If there are no
        `None` values in this tuple, will return (0, 0).
        :rtype: tuple(int or float, int or float)
        """
        values = [val for val in self if val is not None]
        values.append(0)
        return min(values), max(values)


[docs]class AbstractSequenceAnnotations(QtCore.QObject):
    """
    A base class for single-chain and combined-chain sequence annotations

    :ivar titleChanged: A signal emitted after an annotation's title (row header)
        changes.
    :vartype titleChanged: `QtCore.pyqtSignal`
    """

    titleChanged = QtCore.pyqtSignal()
    # avoid circular refs between sequences and annotations
    sequence = util.WeakRefAttribute()

[docs]    def __init__(self, seq):
        """
        :param seq: The sequence to store annotations for.
        :type seq: sequence.Sequence
        """
        super().__init__()
        self.sequence = seq


[docs]class AbstractProteinSequenceAnnotationsMixin:

    domainsChanged = QtCore.pyqtSignal()
    invalidatedDomains = QtCore.pyqtSignal()

[docs]    def __init__(self, *args, **kwargs):
        """
        :param seq: The sequence to store annotations for.
        :type seq: sequence.Sequence
        """
        super().__init__(*args, **kwargs)
        self._binding_sites = None
        self._ligands = None
        self._ligand_asls = None
        self._domains = None
        self._domains_parsed = False

    @cached_property
    def max_b_factor(self):
        values = [val for val in self.b_factor if val is not None]
        return max(values) if len(values) > 0 else 0.0

    @cached_property
    def min_b_factor(self):
        values = [val for val in self.b_factor if val is not None]
        return min(values) if len(values) > 0 else 0.0

[docs]    @QtCore.pyqtSlot()
    def invalidateMaxMinBFactor(self):
        try:
            del self.max_b_factor
        except AttributeError:
            pass
        try:
            del self.min_b_factor
        except AttributeError:
            pass

[docs]    def getAntibodyCDR(self, col, scheme):
        """
        Returns the antibody CDR information of the col'th index in the sequence
        under a given antibody CDR numbering scheme.

        :param col: index into the sequence
        :type col: int

        :param scheme: The antibody CDR numbering scheme to use
        :type scheme: `AntibodyCDRScheme`

        :returns: Antibody CDR label, start, and end positions
        :rtype: `AntibodyCDR`, which is a named tuple of
                (`AntibodyCDRLabel`, int, int) if col is in a CDR,
                otherwise (AntibodyCDRLabel.NotCDR, None, None)
        """
        raise NotImplementedError

[docs]    def getAntibodyCDRs(self, scheme):
        """
        Returns a list of antibody CDR information for the entire sequence.

        :param scheme: The antibody CDR numbering scheme to use
        :type scheme: AntibodyCDRScheme

        :returns: A list of Antibody CDR labels, starts, and end positions
        :rtype: list(AntibodyCDR)
        """
        raise NotImplementedError

[docs]    def isAntibodyChain(self):
        """
        :returns: Whether the sequence described is an antibody chain
        :rtype: bool
        """
        raise NotImplementedError

[docs]    def isAntibodyHeavyChain(self):
        """
        :returns: Whether the sequence described is an antibody heavy chain
        :rtype: bool
        """
        raise NotImplementedError

[docs]    def isAntibodyLightChain(self):
        """
        :returns: Whether the sequence described is an antibody light chain
        :rtype: bool
        """
        raise NotImplementedError

    @property
    def binding_sites(self):
        if self._binding_sites is None:
            self._updateLigandContacts()
        return self._binding_sites

    @property
    def ligands(self):
        if self._ligands is None:
            self._updateLigandContacts()
        return self._ligands

    @property
    def ligand_asls(self):
        if self._ligand_asls is None:
            self._updateLigandContacts()
        return self._ligand_asls

    def _updateLigandContacts(self):
        """
        Update the ligand contacts cache based on the sequence's structure
        """
        raise NotImplementedError

    @QtCore.pyqtSlot()
    def _invalidateLigandContacts(self):
        """
        Invalidates the cached ligand contact data.
        """
        self._binding_sites = None
        self._ligands = None

[docs]    def setLigandDistance(self, distance):
        """
        Updates the ligand distance and invalidates the cache
        """
        raise NotImplementedError

    @property
    def domains(self):
        if self._domains is None and self._domains_parsed:
            seen = set()
            domains = []
            for res in self.sequence.residues():
                if res.domains is None:
                    continue
                for domain_name in res.domains:
                    if domain_name in seen:
                        continue
                    seen.add(domain_name)
                    domains.append(domain_name)
            self._domains = domains
        return self._domains

[docs]    def getSSBondPartner(self, index):
        """
        Return the residue's intra-sequence disulfide bond partner, if any.

        If the residue is not involved in a disulfide bond, its partner has
        been deleted, or its partner is in another sequence, it will return
        None.

        :param index: Index of the residue to check
        :type index: int

        :return: the other Residue in the disulfide bond or None
        :rtype: schrodinger.protein.residue.Residue or None

        """
        current_res = self.sequence[index]
        cur_ss_bond = current_res.disulfide_bond
        if cur_ss_bond is None or not cur_ss_bond.is_intra_sequence:
            return None
        res_pair = cur_ss_bond.res_pair
        for res in res_pair:
            if res != current_res:
                return res
        # Res is deleted
        return None

[docs]    def clearAllCaching(self):
        self._invalidateLigandContacts()
        self.invalidateMaxMinBFactor()

[docs]    def getNumAnnValues(self, ann):
        if not ann.multi_value:
            return 1
        if ann in (self.ANNOTATION_TYPES.binding_sites,
                   self.ANNOTATION_TYPES.kinase_conservation):
            return len(self.ligands)
        elif ann is self.ANNOTATION_TYPES.domains:
            return len(self.domains) if self.domains else 0
        else:
            raise ValueError(f"The multi-row annotation {ann} lacks a "
                             "definition for how many rows it takes up.")


[docs]class SequenceAnnotations(AbstractSequenceAnnotations):
    """
    Knows how to annotate a single-chain sequence

    Annotations can be set at the level of the sequence as a whole, or be per
    sequence element annotations. If an attribute is accessed on the
    SequenceAnnotations object, the attribute is first looked for on the object
    and if not found is assumed to be a per sequence element annotation. If the
    elements in the sequence lack the attribute, an AttributeError will be
    raised.
    """

    def _getSequenceLengthAnnotation(self, annotation_name):
        annotation = [
            None if elem.is_gap else getattr(elem, annotation_name)
            for elem in self.sequence
        ]
        return annotation

    def _setSequenceLengthAnnotation(self, annotation_name, values):
        self._lengthCheck(values)
        for el, val in zip(self.sequence, values):
            if val is not None:
                setattr(el, annotation_name, val)

    def _lengthCheck(self, values):
        if len(values) != len(self.sequence):
            raise ValueError("Values supplied don't match the sequence length")

    def __getattr__(self, attr):
        """
        Returns a list of annotations, one for each residue in the sequence.
        """
        return self._getSequenceLengthAnnotation(attr)


[docs]class ProteinSequenceAnnotations(AbstractProteinSequenceAnnotationsMixin,
                                 SequenceAnnotations):
    """
    Knows how to annotate a ProteinSequence
    """

    annotationInvalidated = QtCore.pyqtSignal(object)
    invalidatedLigandContacts = QtCore.pyqtSignal()
    invalidatedMaxMinBFactor = QtCore.pyqtSignal()

    @enum.unique
    class ANNOTATION_TYPES(_SequenceAnnotationEnum):
        # the enum values must not be changed for backwards compatibility.
        # declaration order determines the order in which the annotations are
        # displayed.
        pairwise_constraints = 1
        alignment_set = 2
        resnum = 3
        rescode = 4
        disulfide_bonds = 5
        helix_propensity = 6
        beta_strand_propensity = 7
        turn_propensity = 8
        helix_termination_tendency = 9
        exposure_tendency = 10
        steric_group = 11
        side_chain_chem = 12
        hydrophobicity = 13
        isoelectric_point = 14
        b_factor = 15
        window_hydrophobicity = 16
        window_isoelectric_point = 17
        secondary_structure = 18
        domains = 20
        antibody_cdr = 21
        sasa = 22
        pfam = 23
        pred_disulfide_bonds = 24
        pred_secondary_structure = 25
        pred_accessibility = 26
        pred_disordered = 27
        pred_domain_arr = 28
        proximity_constraints = 29
        binding_sites = 19
        kinase_conservation = 31
        kinase_features = 30
        gpcr_segment = 32
        gpcr_generic_number = 33

    ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES)

    RES_PROPENSITY_ANNOTATIONS = {
        ANNOTATION_TYPES.beta_strand_propensity,
        ANNOTATION_TYPES.exposure_tendency, ANNOTATION_TYPES.helix_propensity,
        ANNOTATION_TYPES.helix_termination_tendency,
        ANNOTATION_TYPES.side_chain_chem, ANNOTATION_TYPES.steric_group,
        ANNOTATION_TYPES.turn_propensity
    }

    PRED_ANNOTATION_TYPES = {
        ANNOTATION_TYPES.pred_disulfide_bonds,
        ANNOTATION_TYPES.pred_secondary_structure,
        ANNOTATION_TYPES.pred_accessibility,
        ANNOTATION_TYPES.pred_disordered,
        ANNOTATION_TYPES.pred_domain_arr,
    }

    ANNOTATION_TYPES.resnum.can_expand = True
    ANNOTATION_TYPES.rescode.can_expand = True
    ANNOTATION_TYPES.helix_propensity.can_expand = True
    ANNOTATION_TYPES.beta_strand_propensity.can_expand = True
    ANNOTATION_TYPES.turn_propensity.can_expand = True
    ANNOTATION_TYPES.helix_termination_tendency.can_expand = True
    ANNOTATION_TYPES.exposure_tendency.can_expand = True
    ANNOTATION_TYPES.steric_group.can_expand = True
    ANNOTATION_TYPES.side_chain_chem.can_expand = True
    ANNOTATION_TYPES.secondary_structure.can_expand = True
    ANNOTATION_TYPES.domains.can_expand = True
    ANNOTATION_TYPES.antibody_cdr.can_expand = True
    ANNOTATION_TYPES.pfam.can_expand = True
    ANNOTATION_TYPES.pred_secondary_structure.can_expand = True
    ANNOTATION_TYPES.pred_accessibility.can_expand = True
    ANNOTATION_TYPES.pred_disordered.can_expand = True
    ANNOTATION_TYPES.pred_domain_arr.can_expand = True
    ANNOTATION_TYPES.binding_sites.can_expand = True
    # ANNOTATION_TYPES.kinase_conservation.can_expand = True  # TODO MSV-3355
    ANNOTATION_TYPES.kinase_features.can_expand = True

    ANNOTATION_TYPES.resnum.title = "Residue number"
    ANNOTATION_TYPES.resnum.tooltip = "Residue Numbers"
    ANNOTATION_TYPES.rescode.title = "Residue code"
    ANNOTATION_TYPES.disulfide_bonds.title = _disulfide_bond_title
    ANNOTATION_TYPES.disulfide_bonds.tooltip = 'Disulfide Bonds'
    ANNOTATION_TYPES.secondary_structure.title = _ssa_title
    ANNOTATION_TYPES.secondary_structure.tooltip = "Secondary Structure Assignment"
    ANNOTATION_TYPES.window_hydrophobicity.title = 'Hydrophobicity'
    ANNOTATION_TYPES.window_isoelectric_point.title = 'Isoelectric Point'
    ANNOTATION_TYPES.b_factor.requires_structure = True
    ANNOTATION_TYPES.b_factor.tooltip = 'B Factor'
    ANNOTATION_TYPES.binding_sites.requires_structure = True
    ANNOTATION_TYPES.binding_sites.multi_value = True
    ANNOTATION_TYPES.binding_sites.tooltip = "Binding Site (Ligand Contacts)"
    ANNOTATION_TYPES.kinase_conservation.title = "Kinase Binding Site Conservation"
    ANNOTATION_TYPES.kinase_conservation.requires_structure = True
    ANNOTATION_TYPES.kinase_conservation.multi_value = True
    ANNOTATION_TYPES.kinase_features.title = "Kinase Features"
    ANNOTATION_TYPES.domains.tooltip = "Domains"
    ANNOTATION_TYPES.domains.multi_value = True
    ANNOTATION_TYPES.antibody_cdr.title = "Antibody CDRs"
    ANNOTATION_TYPES.pairwise_constraints.title = "Constraints"
    ANNOTATION_TYPES.proximity_constraints.title = "Constraints"

    ANNOTATION_TYPES.pred_disulfide_bonds.title = f"{_disulfide_bond_title} (P)"
    ANNOTATION_TYPES.pred_disulfide_bonds.tooltip = "Disulfide Bonds Prediction"
    ANNOTATION_TYPES.pred_disordered.title = "Disordered (P)"
    ANNOTATION_TYPES.pred_disordered.tooltip = "Disordered Prediction"
    ANNOTATION_TYPES.pred_secondary_structure.title = f"{_ssa_title} (P)"
    ANNOTATION_TYPES.pred_secondary_structure.tooltip = "Secondary Structure Assignment Prediction"
    ANNOTATION_TYPES.pred_domain_arr.title = "Domains (P)"
    ANNOTATION_TYPES.pred_domain_arr.tooltip = "Domains Prediction"
    ANNOTATION_TYPES.pred_accessibility.title = "Accessibility (P)"
    ANNOTATION_TYPES.pred_accessibility.tooltip = "Accessibility Prediction"

    ANNOTATION_TYPES.gpcr_segment.title = "GPCR Segment"
    ANNOTATION_TYPES.gpcr_segment.tooltip = "GPCR Segment"
    ANNOTATION_TYPES.gpcr_generic_number.title = "GPCR Generic Number"
    ANNOTATION_TYPES.gpcr_generic_number.tooltip = "GPCR Generic Number"

[docs]    def __init__(self, seq):
        super().__init__(seq)
        self.from_maestro = False  # If seq has corresponding entry in Maestro.
        self.maestro_entry_id = None
        self.maestro_chain_name = None
        self.chain_id = None
        self._binding_site_residues = collections.defaultdict(set)

        self._pred_disulfide_bonds = None
        self._ligand_distance = 5
        self._cdr_finder = None
        self._hydrophobicity_window_padding = DEFAULT_WINDOW_PADDING
        self._isoelectric_point_window_padding = DEFAULT_WINDOW_PADDING

        residue_changed_slots = (self._invalidateLigandContacts,
                                 self._renumberAntibodyCDRSlot,
                                 self._invalidateSASA,
                                 self.invalidateWindowHydrophobicity,
                                 self.invalidateWindowIsoelectricPoint,
                                 self.invalidateMaxMinBFactor,
                                 self._invalidateDomains)

        for slot in residue_changed_slots:
            self.sequence.residuesChanged.connect(slot)
        self.sequence.residuesAdded.connect(self._invalidateAntibodyCDRSlot)
        self.sequence.residuesRemoved.connect(self._invalidateAntibodyCDRSlot)

[docs]    @QtCore.pyqtSlot()
    def invalidateMaxMinBFactor(self):
        super().invalidateMaxMinBFactor()
        self.invalidatedMaxMinBFactor.emit()

    @cached_property
    def window_hydrophobicity(self):
        return self._getWindowAvgAnnotation("hydrophobicity",
                                            self.hydrophobicity_window_padding)

    @property
    def hydrophobicity_window_padding(self):
        return self._hydrophobicity_window_padding

    @hydrophobicity_window_padding.setter
    def hydrophobicity_window_padding(self, value):
        self._hydrophobicity_window_padding = value
        self.invalidateWindowHydrophobicity()

    @property
    def isoelectric_point_window_padding(self):
        return self._isoelectric_point_window_padding

    @property
    def binding_site_residues(self):
        """
        Binding site residues of the sequence as a map, with key being the
        ligand name(str) and value is the set of residues(protein.Residue).
        """
        if not self._binding_site_residues:
            self._updateLigandContacts()
        return self._binding_site_residues

    @isoelectric_point_window_padding.setter
    def isoelectric_point_window_padding(self, value):
        self._isoelectric_point_window_padding = value
        self.invalidateWindowIsoelectricPoint()

[docs]    @QtCore.pyqtSlot()
    def invalidateWindowHydrophobicity(self):
        """
        Invalidate the cached window hydrophobicity data.  Note that this method
        is also called from the sequence when the window size changes.
        """
        try:
            del self.window_hydrophobicity
        except AttributeError:
            pass
        self.annotationInvalidated.emit(
            self.ANNOTATION_TYPES.window_hydrophobicity)

    @cached_property
    def window_isoelectric_point(self):
        return self._getWindowAvgAnnotation(
            "isoelectric_point", self.isoelectric_point_window_padding)

[docs]    @QtCore.pyqtSlot()
    def invalidateWindowIsoelectricPoint(self):
        """
        Invalidate the cached window isoelectric point data.  Note that this
        method is also called from the sequence when the window size changes.
        """
        try:
            del self.window_isoelectric_point
        except AttributeError:
            pass
        self.annotationInvalidated.emit(
            self.ANNOTATION_TYPES.window_isoelectric_point)

    @cached_property
    def sasa(self):
        try:
            finder = _SASAFinder(self.sequence)
        except ValueError:
            vals = [None] * len(self.sequence)
        else:
            vals = finder.getSASAs()
        return TupleWithRange(vals)

    @QtCore.pyqtSlot()
    def _invalidateSASA(self):
        """
        Invalidates the cached SASA data.

        The connected signal emits (int, int) and the slot decorator must match
        the signal, but the method does not need to have the params.
        """
        try:
            del self.sasa
        except AttributeError:
            pass
        self.annotationInvalidated.emit(self.ANNOTATION_TYPES.sasa)

    def _getWindowAvgAnnotation(self, annotation_name, window_padding):
        """
        Calculate values for the specified window averaged annotation for the
        entire sequence.

        :param annotation_name: name of annotation to average in a window
        :type annotation_name: str

        :param window_padding: number of values to average over, on both
               the left and the right; window_padding=4 gives window size of 9
        :type window_padding: int

        :return: list containing sliding window averages of annotation
            values specified by annotation_name
        :rtype: list(float or None)
        """
        anno_vals = [
            getattr(res, annotation_name) if res.is_res else None
            for res in self.sequence
        ]
        if window_padding == 0:
            return TupleWithRange(anno_vals)
        anno_vals_wo_gaps = [val for val in anno_vals if val is not None]
        avgs_wo_gaps = get_rolling_average(anno_vals_wo_gaps, window_padding)
        if not avgs_wo_gaps:
            # The sequence is smaller than the window size, so we can't
            # calculate any average values
            return TupleWithRange([None] * len(anno_vals))

        # Figure out the index of the first residue that has a fully populated
        # window (i.e. the first residue with `window_padding` non-gap residues
        # before it)
        res_count = 0
        start_index = 0
        for i, val in enumerate(anno_vals):
            if val is not None:
                res_count += 1
                if res_count == window_padding:
                    start_index = i + 1
                    break

        # copy values from avgs_wo_gaps into avgs_with_gaps, taking gap
        # locations into account
        avgs_index = 0
        avgs_with_gaps = [None] * len(anno_vals)
        for i, val in enumerate(anno_vals[start_index:], start=start_index):
            if val is not None:
                avgs_with_gaps[i] = avgs_wo_gaps[avgs_index]
                avgs_index += 1
                if avgs_index == len(avgs_wo_gaps):
                    # We've reached the last residue that has a fully poulated
                    # window.  There are exactly `window_padding` non-gap
                    # residues afer this one, which we can confirm by
                    # uncommenting the assertion below. (This should normally be
                    # left commented out for performance reasons.)
                    # assert sum(1 for val in anno_vals[i + 1:] if val
                    #            is not None) == window_padding
                    break
        return TupleWithRange(avgs_with_gaps)

[docs]    def getAntibodyCDR(self, col, scheme):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        for cdr in self.getAntibodyCDRs(scheme):
            if cdr.start <= col <= cdr.end:
                return cdr
        return AntibodyCDR(label=AntibodyCDRLabel.NotCDR, start=None, end=None)

[docs]    def getAntibodyCDRs(self, scheme):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        if not self.sequence.getGaplessLength():
            return []
        if self._cdr_finder is None:
            self._cdr_finder = _AntibodyCDRFinder(self.sequence)

        return self._cdr_finder.getCDRs(scheme)

[docs]    def isAntibodyChain(self):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        if not self.sequence.getGaplessLength():
            return []
        if self._cdr_finder is None:
            self._cdr_finder = _AntibodyCDRFinder(self.sequence)
        return self._cdr_finder.isAntibodyChain()

[docs]    def isAntibodyHeavyChain(self):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        if not self.isAntibodyChain():
            return False
        return self._cdr_finder.isAntibodyHeavyChain()

[docs]    def isAntibodyLightChain(self):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        if not self.isAntibodyChain():
            return False
        return self._cdr_finder.isAntibodyLightChain()

    @QtCore.pyqtSlot()
    def _invalidateAntibodyCDRSlot(self):
        """
        Invalidates the cached antibody CDR data.
        """
        self._invalidateAntibodyCDRs(recalculate=True)

    @QtCore.pyqtSlot()
    def _renumberAntibodyCDRSlot(self):
        """
        Renumbers the cached antibody CDR data.
        """
        self._invalidateAntibodyCDRs(recalculate=False)

    def _invalidateAntibodyCDRs(self, recalculate=True):
        """
        Invalidate the Antibody CDR cache. If we've only added gaps to the
        sequence, we only need to re-assign the CDR indexes, but
        if we add or insert residues, we need to recalculate CDRs entirely.

        :param recalculate: whether to recalculate CDRs entirely
        :type recalculate: bool
        """
        if recalculate:
            self._cdr_finder = None
        elif self._cdr_finder is not None:
            self._cdr_finder.forceIndexReassignment()

[docs]    def getSparseRescodes(self, modulo):
        codes = []
        for res in self.sequence:
            if res.is_res and res.resnum and res.resnum % modulo == 0:
                codes.append(res.rescode)
            else:
                codes.append(None)
        return codes

[docs]    def onStructureChanged(self):
        self._invalidateLigandContacts()
        self._invalidateAntibodyCDRs(recalculate=True)
        self._invalidateSASA()

[docs]    def setLigandDistance(self, distance):
        # See AbstractProteinSequenceAnnotationsMixin for method documentation
        if distance == self._ligand_distance:
            return
        self._ligand_distance = distance
        self._invalidateLigandContacts()

    @QtCore.pyqtSlot()
    def _invalidateLigandContacts(self):
        # See AbstractProteinSequenceAnnotationsMixin for method documentation
        super()._invalidateLigandContacts()
        self._binding_site_residues.clear()
        self.invalidatedLigandContacts.emit()

    def _updateLigandContacts(self):
        # See AbstractProteinSequenceAnnotationsMixin for method documentation
        try:
            contacts_finder = _LigandContactsFinder(self.sequence)
        except ValueError:
            self._ligands = []
            self._ligand_asls = []
            self._binding_sites = [[] for _ in self.sequence]
            return

        far_lig_dist = self._ligand_distance + 1
        close_lig_dist = self._ligand_distance - 1

        ligands, lig_asls = contacts_finder.getLigandsWithin(far_lig_dist)
        binding_sites = [
            [BINDING_SITE.NoContact] * len(ligands) for _ in self.sequence
        ]

        for i, lig in enumerate(ligands):
            far_idxs = contacts_finder.resIndexesNear(lig, far_lig_dist)
            close_idxs = contacts_finder.resIndexesNear(lig, close_lig_dist)

            for idx in far_idxs:
                binding_sites[idx][i] = BINDING_SITE.FarContact
                self._binding_site_residues[lig].add(self.sequence[idx])
            for idx in close_idxs:
                binding_sites[idx][i] = BINDING_SITE.CloseContact
                self._binding_site_residues[lig].add(self.sequence[idx])

        self._ligand_asls = lig_asls
        self._binding_sites = binding_sites
        need_signal = (self._ligands != ligands)
        self._ligands = ligands
        if need_signal:
            self.titleChanged.emit()

    @QtCore.pyqtSlot()
    def _invalidateDomains(self):
        """
        Clears the domain annotation cache.
        """
        if self._domains is not None:
            self._domains = None
            self.invalidatedDomains.emit()

[docs]    def parseDomains(self, filename):
        """
        Parse XML file from UniProt database to get domain information.

        :param filename: the XML file to parse for domain information
        :type filename: str

        :return: a list of the domains (names) for the sequence in order
        :rtype: list(str)
        """
        domains = []
        record = SeqIO.read(filename, "uniprot-xml")
        residues_by_resnum = collections.defaultdict(list)
        for res in self.sequence.residues():
            residues_by_resnum[res.resnum].append(res)
        for feature in record.features:
            if feature.type != "domain":
                continue
            name = feature.qualifiers.get('description')
            if name is None:
                continue
            location = feature.location
            start = int(location.start)
            end = int(location.end)
            any_res_found = False
            for domain_resnum in range(start, end + 1):
                residues = residues_by_resnum.get(domain_resnum)
                if residues is None:
                    continue
                any_res_found = True
                for res in residues:
                    if res.domains is None:
                        res.domains = {name}
                    else:
                        res.domains.add(name)
            if any_res_found:
                domains.append(name)
        self._domains = domains
        self._domains_parsed = True
        self.domainsChanged.emit()
        return domains

[docs]    def resetAnnotation(self, ann):
        "Force a reset of an annotation's cache."
        if ann is self.ANNOTATION_TYPES.binding_sites:
            self._invalidateLigandContacts()
        elif ann is self.ANNOTATION_TYPES.antibody_cdr:
            self._invalidateAntibodyCDRs()
        elif ann is self.ANNOTATION_TYPES.window_hydrophobicity:
            self.invalidateWindowHydrophobicity()
        elif ann is self.ANNOTATION_TYPES.window_isoelectric_point:
            self.invalidateWindowIsoelectricPoint()
        elif ann is self.ANNOTATION_TYPES.sasa:
            self._invalidateSASA()
        elif ann is self.ANNOTATION_TYPES.domains:
            self._invalidateDomains()

[docs]    def clearAllCaching(self):
        for ann in self.ANNOTATION_TYPES:
            self.resetAnnotation(ann)

    @property
    def inscode(self):
        return self._getSequenceLengthAnnotation('inscode')

    @inscode.setter
    def inscode(self, codes):
        self._setSequenceLengthAnnotation('inscode', codes)

    @property
    def resnum(self):
        return self._getSequenceLengthAnnotation('resnum')

    @resnum.setter
    def resnum(self, nums):
        self._setSequenceLengthAnnotation('resnum', nums)

[docs]    def getCDRResidueList(self, scheme):
        """
        :return: List of CDR Residues.
        :rtype: List[str]
        """
        if self._cdr_finder is None:
            self._cdr_finder = _AntibodyCDRFinder(self.sequence)
        if scheme is not self._cdr_finder.scheme:
            self._cdr_finder.updateScheme(scheme)
        return self._cdr_finder.getResidueList()


[docs]class NucleicAcidSequenceAnnotations(ProteinSequenceAnnotations):
    # For the alpha release, we only have "limited support" of nucleic acids,
    # so this class is just a stub.
    # TODO (MSV-1504): Create proper nucleic acid domain objects

[docs]    def isAntibodyChain(self):
        return False


###############################################################################
# Global (Alignment) level Annotations
###############################################################################


[docs]class ProteinAlignmentAnnotations(object):
    """
    Knows how to annotate an alignment (a collection of aligned sequences)
    """

    @enum.unique
    class ANNOTATION_TYPES(_AnnotationEnum):
        # the enum values must not be changed for backwards compatibility.
        # declaration order determines the order in which the annotations are
        # displayed.
        indices = 1
        mean_hydrophobicity = 2
        mean_isoelectric_point = 3
        consensus_symbols = 4
        consensus_seq = 5
        consensus_freq = 6
        sequence_logo = 7

    ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES)

    ANNOTATION_TYPES.mean_hydrophobicity.tooltip = 'Mean Hydrophobicity'
    ANNOTATION_TYPES.mean_isoelectric_point.tooltip = 'Mean Isoelectric Point'
    ANNOTATION_TYPES.consensus_symbols.tooltip = 'Consensus Symbols'
    ANNOTATION_TYPES.consensus_seq.tooltip = 'Consensus Sequence'
    ANNOTATION_TYPES.sequence_logo.tooltip = 'Sequence Logo'

    ANNOTATION_TYPES.indices.title = ""
    # avoid circular refs between alignments and annotations
    alignment = util.WeakRefAttribute()

[docs]    def __init__(self, aln):
        """
        :param aln: `alignment.Alignment`
        """
        self.alignment = aln

    def _alnMeanAnnotation(self, annotation_name):
        """
        returns: Per-column averages of annotation values specified by
        annotation_name, or 0.0 if columns includes only gaps.
        """
        columns = list(self.alignment.columns(omit_gaps=True, match_type=True))
        means = []
        for column in columns:
            values_with_missing = (
                getattr(res, annotation_name) for res in column)
            values = [val for val in values_with_missing if val is not None]
            divisor = float(len(values)) if values else 1.0
            mean = sum(values) / divisor
            means.append(mean)
        return TupleWithRange(means)

    @cached_property
    def indices(self):
        """
        A numbering of all the column indices in an alignment
        """
        return tuple(range(1, self.alignment.num_columns + 1))

    @cached_property
    def mean_hydrophobicity(self):
        """
        returns: A list of floats representing per-column averages of the
        hydrophobicity of residues in the alignment
        """
        return self._alnMeanAnnotation('hydrophobicity')

    @cached_property
    def mean_isoelectric_point(self):
        """
        returns: A list of floats representing per-column averages of the
        isoelectric point of residues in the alignment
        """
        return self._alnMeanAnnotation('isoelectric_point')

    @cached_property
    def consensus_seq(self):
        """
        Consensus sequence in the alignment. If there is more than one
        highest freq. residue in the column, save all of them.

        :return: consensus sequence
        :rtype: list(list(schrodinger.protein.residue.Residue))
        """
        seq = []
        for most_common in self.alignment.getFrequencies(normalize=False):
            if most_common:
                _, max_freq = most_common[0]
                cons = [res for res, freq in most_common if freq == max_freq]
            else:
                cons = []
            seq.append(cons)
        return seq

    @cached_property
    def consensus_freq(self):
        """
        Returns the frequency of the consensus residue in each
        alignment column as a list. Gaps are not used for calculation.

        :return: consensus residue frequencies
        :rtype: TupleWithRange(float)
        """
        consensus_only = []
        for column in self.alignment.getFrequencies():
            if column:
                freq_percentage = round(float(column[0][1] * 100), 2)
                consensus_only.append(freq_percentage)
            else:  # gap-only columns
                consensus_only.append(0.00)
        return TupleWithRange(consensus_only)

    @cached_property
    def consensus_symbols(self):
        """
        Consensus symbols in the alignment based on pre-defined residue sets,
        same as in ClustalW

        :return: consensus symbols for each alignment position
        :type: A list of ConsensusSymbol enums.
        """
        strong_sets = [
            set("STA"),
            set("NEQK"),
            set("NHQK"),
            set("NDEQ"),
            set("QHRK"),
            set("MILV"),
            set("MILF"),
            set("HY"),
            set("FYW")
        ]

        weak_sets = [
            set("CSA"),
            set("ATV"),
            set("SAG"),
            set("STNK"),
            set("STPA"),
            set("SGND"),
            set("SNDEQK"),
            set("NDEQHK"),
            set("NEQHRK"),
            set("FVLIM"),
            set("HFY")
        ]
        levels = []
        for column in self.alignment.columns(omit_gaps=True, match_type=True):
            level = Consensus.not_conserved
            if column:
                column_res = set(res.short_code for res in column)
                if len(column_res) == 1:
                    # fully conserved to a single residue
                    level = Consensus.fully_conserved
                elif any(column_res.issubset(strong) for strong in strong_sets):
                    # conserved to one of the strong sets
                    level = Consensus.strongly_conserved
                elif any(column_res.issubset(weak) for weak in weak_sets):
                    # conserved to one of the weak sets
                    level = Consensus.weakly_conserved
            levels.append(level)

        return levels

    @cached_property
    def sequence_logo(self):
        """
        Calculates normalized frequencies of individual amino acids per
        alignment position, and overall estimate of column composition
        diversity ('bits'). Bit values are weighted by the number of gaps in
        the column.

        Schneider TD, Stephens RM (1990). "Sequence Logos: A New Way to
        Display Consensus Sequences". Nucleic Acids Res 18 (20): 6097–6100.
        doi:10.1093/nar/18.20.6097

        :return: the list of bits and frequencies (in decreasing order) of the
            residues in each column of the alignment.
        :rtype: list(tuple(float, tuple(tuple(str, float))))
        """
        alignment = self.alignment
        logo = []
        num_seqs = len(alignment.getSeqsMatchingRefType())
        for (frequencies,
             column) in zip(alignment.getFrequencies(),
                            alignment.columns(omit_gaps=True, match_type=True)):
            if frequencies:
                # Calculate column uncertainty
                uncertainty = 0.00
                for (_, freq) in frequencies:
                    uncertainty -= freq * math.log(freq, 2.0)
                # Bits represent amount of information at column, weighted by
                # number of gaps.
                bit_weight = len(column) / num_seqs
                bits = (LOGO_MAX_DIVERSITY - uncertainty) * bit_weight
                entry = (bits, frequencies)
            else:  # gap-only columns
                entry = (LOGO_MAX_DIVERSITY, ())
            logo.append(entry)
        return logo

[docs]    def clearAllCaching(self):
        for attr in ("indices", "mean_hydrophobicity", "mean_isoelectric_point",
                     "consensus_seq", "consensus_freq", "consensus_symbols",
                     "sequence_logo"):
            try:
                delattr(self, attr)
            except AttributeError:
                pass


class _LigandContactsFinder(object):
    """
    An class that finds the ligands that are close to a sequence, as well as the
    residues that are close to those ligands. Note that this only makes sense
    for sequences that have structures.
    """

    seq = util.WeakRefAttribute()

    def __init__(self, seq):
        """
        :param seq: The sequence to find the ligand contacts of
        :type seq: `schrodinger.protein.sequence.Sequence`

        :raises ValueError: if the sequence has no associated structure
        """
        ct = seq.getStructure()
        if ct is None:
            raise ValueError("Ligand contacts can only be found for "
                             "a sequence with a structure.")

        self.seq = seq
        self.ct = ct
        self._all_ligands = analyze.find_ligands(self.ct)
        self._lig_name_dict = {
            self._makeLigandName(lig): lig for lig in self._all_ligands
        }
        self._res_dict = {(res.resnum, res.inscode): i
                          for (i, res) in enumerate(seq)
                          if res.is_res}

    def _makeLigandName(self, lig):
        """
        :param lig: the ligand to make a name for
        :type lig: schrodinger.structutils.analyze.Ligand

        :return: the name for the ligand
        :rtype: string
        """
        return make_ligand_name(self.ct, lig)

    def getLigandsWithin(self, dist):
        """
        Find all ligands that are within a certain distance of a sequence.

        :param dist: The distance around the sequence to search for ligands
        :type dist: int

        :return: a tuple of ligand names and a tuple of ligand ASLs
        :rtype: tuple[tuple[str], tuple[str]]
        """
        ligand_names_and_asls = []
        for lig in self._all_ligands:
            query = ("(within {dist} ({ligand_asl})) "
                     "and (protein) "
                     "and (chain. {chain_id})").format(
                         ligand_asl=lig.ligand_asl,
                         dist=dist,
                         chain_id=self.seq.structure_chain)
            if analyze.evaluate_asl(self.ct, query):
                # lig.ligand_asl can be "too compact" (e.g. `m.n 2`) so
                # construct a more human-readable ASL
                atom = self.ct.atom[lig.atom_indexes[0]]
                lig_asl = f"(res. {atom.resnum} and res.ptype {atom.pdbres})"
                ligand_names_and_asls.append(
                    (self._makeLigandName(lig), lig_asl))
        if not ligand_names_and_asls:
            return (), ()
        ligand_names_and_asls.sort()
        # Split name-ASL tuples into names and ASLs
        lig_names, lig_asls = zip(*ligand_names_and_asls)
        return lig_names, lig_asls

    def resIndexesNear(self, lig_name, dist):
        """
        Find all residues that are within a certain distance of a ligand.

        :param lig_name: the name of the ligand to search for nearby residues
        :type lig_name: string

        :param dist: the distance around the ligand to search for residues
        :type dist: int

        :return: indexes of the residues in the sequence that are close
                 to the ligand
        :rtype: set(int)
        """
        try:
            lig = self._lig_name_dict[lig_name]
        except KeyError:
            raise KeyError("Ligand {lig_name} not found on structure.".format(
                lig_name=lig_name))

        # Find atom indexes in structure within `dist` angstroms of the ligand
        # which are on the sequence's chain
        query = ("(within {dist} ({ligand_asl})) "
                 "and (protein) "
                 "and (chain. {chain_id})").format(
                     ligand_asl=lig.ligand_asl,
                     dist=dist,
                     chain_id=self.seq.structure_chain)
        nearby_atom_indexes = analyze.evaluate_asl(self.ct, query)

        # Get residue indexes in the sequence corresponding to those atoms
        res_indexes = set()
        for atom_index in nearby_atom_indexes:
            atom = self.ct.atom[atom_index]
            res_key = (atom.resnum, atom.inscode)

            # FIXME: (MSV-1371)
            # Remove this check once residue insertion/deletion is
            # synchronized between the MSV and the workspace.
            if res_key not in self._res_dict:
                continue

            res_index = self._res_dict[res_key]
            res_indexes.add(res_index)

        return res_indexes


class _AntibodyCDRFinder(object):
    """
    Class to help with finding CDRs ("complementary determining regions") in
    an antibody chain. There are different numbering schemes for identifying
    where the CDRs span. Converting between the schemes doesn't cost much time,
    but determining the where the CDRs are in the first place is rather expensive
    (~0.3s), so we have to be careful about caching things.
    """

    seq = util.WeakRefAttribute()

    def __init__(self, seq):
        """
        :param seq: The sequence to find the CDRs on
        :type seq: `schrodinger.protein.sequence.ProteinSequence`
        """
        if antibody is _MODULE_NOT_YET_IMPORTED:
            # in-function import to avoid circular import
            _delayed_antibody_import()
        if antibody is None:
            warnings.warn('Prime not installed; antibody CDRs cannot be found!')
            # define dummy attributes so that we can still access this object
            # normally, but it won't report any CDRs
            self.getCDRs = lambda scheme: []
            self.forceIndexReassignment = lambda: None
            self.isAntibodyChain = lambda: False
            return

        # normal instantiation when Prime and Clustal are present
        self.seq = seq
        self.scheme = AntibodyCDRScheme.Chothia
        self.seq_type = _SeqType(self.seq, self.scheme.name)
        self._cdrs = None

    def getCDRs(self, scheme):
        """
        Returns the antibody CDRs present on the sequence, numbered by the given
        numbering scheme.

        :param scheme: The antibody CDR numbering scheme to use
        :type scheme: `AntibodyCDRScheme` enum

        :returns: the antibody CDRs on the sequence
        :rtype: list of `AntibodyCDR` if the sequence is an antibody chain,
                otherwise an empty list
        """
        if scheme is not self.scheme:
            self.updateScheme(scheme)
        if self._cdrs is None:
            self._cdrs = self._extractCDRs()
        return self._cdrs

    def updateScheme(self, scheme):
        """
        Update the numbering scheme by asking the antibody.SeqType "backend".

        :param scheme: The antibody CDR numbering scheme to use
        :type scheme: `AntibodyCDRScheme` enum
        """
        self.seq_type.convertScheme(scheme.name)
        self.scheme = scheme
        self.forceIndexReassignment()

    def forceIndexReassignment(self):
        """
        Force a recalculation of the CDR start and end indices. This is required
        when the numbering scheme changes and when gaps are inserted/removed.
        """
        self._cdrs = None

    def _extractCDRs(self):
        """
        Extract the CDR information from the antibody.SeqType "backend" that
        calculated them. Note that the calculated indexes ignore gaps, so we
        need to translate them back into actual indexes into the sequence.

        :returns: the antibody CDRs on the sequence
        :rtype: list of `AntibodyCDR` if the sequence is an antibody chain,
                otherwise an empty list
        """
        if not self.seq:
            # SeqType doesn't create attributes for blank sequences, so we'd get
            # a traceback below
            return []
        index_map = [i for (i, res) in enumerate(self.seq) if res.is_res]
        cdr_indexes = ((index_map[start], index_map[end])
                       for start, end in self.seq_type.cdr_index)
        cdr_labels = self.seq_type.cdr_label

        return [
            AntibodyCDR(label=AntibodyCDRLabel[label], start=start, end=end)
            for label, (start, end) in zip(cdr_labels, cdr_indexes)
        ]

    def isAntibodyChain(self):
        """
        :returns: Whether the sequence is an antibody chain
        :rtype: bool
        """
        return bool(self.seq_type.cdr_label)

    def isAntibodyHeavyChain(self):
        """
        :returns: Whether the sequence is an antibody heavy chain
        :rtype: bool
        """
        return self.seq_type.isHeavyChain()

    def isAntibodyLightChain(self):
        """
        :returns: Whether the sequence is an antibody light chain
        :rtype: bool
        """
        return self.seq_type.isLightChain()

    def getResidueList(self):
        return self.seq_type.resid_list


def _prep_seq_for_prime(seq):
    """
    Convert a ProteinSequence to an uppercase gapless seq str
    """
    seq_str = str(seq).replace(seq.gap_char, "")
    seq_str = seq_str.upper()
    return seq_str


[docs]class SeqTypeMixin:
    """
    Mixin to customize antibody.SeqType for MSV2. See _delayed_antibody_import
    for class declaration.
    """

    _HEAVY_TYPES = {"heavy"}
    _LIGHT_TYPES = {"light_lambda", "light_kappa"}

[docs]    def __init__(self, seq, *args, **kwargs):
        """
        :type seq: schrodinger.protein.sequence.ProteinSequence
        """
        seq_str = _prep_seq_for_prime(seq)
        super().__init__(seq_str, *args, **kwargs)

[docs]    def isHeavyChain(self):
        return self.type in self._HEAVY_TYPES

[docs]    def isLightChain(self):
        return self.type in self._LIGHT_TYPES


class _SASAFinder(object):
    """
    Class to help with calculating the solvent-accessible surface area on each
    residue in a sequence.
    """

    seq = util.WeakRefAttribute()

    def __init__(self, seq):
        """
        :param seq: The sequence to find the residue SASAs of
        :type seq: schrodinger.protein.sequence.ProteinSequence

        :raises ValueError: if the sequence doesn't have a structure
        """
        self.seq = seq
        ct = seq.getStructure()
        if ct is None:
            raise ValueError("SASA can only be calculated for sequences "
                             "with structure")
        self.ct = ct.chain[seq.chain].extractStructure()

    def getSASAs(self):
        """
        :returns: The SASA of each residue, by index in the sequence. For
            gaps in the sequence or structureless residues, the SASA is None.
        :rtype: list of (float or None)
        """
        sasa_by_atom = analyze.calculate_sasa_by_atom(self.ct)

        # Iterating through residues in the structure may not correspond to
        # residues in the sequence (because of gaps, solvents, ligands, etc),
        # so we key each residue's sasa by residue number and insertion code.
        sasas_by_res = {}
        for res in self.ct.residue:
            key = (res.resnum, res.inscode)
            sasa = sum(sasa_by_atom[a.index - 1] for a in res.atom)
            sasas_by_res[key] = sasa

        sasas = [None] * len(self.seq)
        for i, res in enumerate(self.seq):
            if res.hasStructure():
                key = (res.resnum, res.inscode)
                sasas[i] = sasas_by_res[key]

        return sasas


[docs]class CombinedChainSequenceAnnotationMeta(msv_utils.QtDocstringWrapperMetaClass
                                         ):
    """
    The metaclass for `CombinedChainSequenceAnnotations`.  This metaclass
    automatically wraps getters for all sequence annotations.
    """

    def __new__(metacls,
                cls,
                bases,
                classdict,
                *,
                wraps=None,
                cached_annotations=(),
                wrapped_properties=()):
        """
        See Python documentation for positional argument documentation.

        :param wraps: The `SequenceAnnotations` subclass to wrap.  If None, no
            wrapping will be done.  (This allows for subclassing of the wrapper
            class.)
        :type wraps: type or None

        :param cached_annotations: Names of annotations that should be wrapped
            as a `cached_property` and should return a `TupleWithRange` (which
            caches the min and max values).  This list will be stored in the
            newly created class as `_cached_annotations`.
        :type cached_annotations: tuple(str)

        :param wrapped_properties: Names of properties of `wraps` that should be
            wrapped.  Values will always be fetched from the first chain of the
            sequence.
        :type wrapped_properties: tuple(str)
        """
        if wraps is not None:
            # automatically wrap all annotations unless there's already an
            # attribute of the same name (such as binding_sites)
            for anno in wraps.ANNOTATION_TYPES:
                if (anno.name not in classdict and
                        not any(anno.name in dir(base) for base in bases)):
                    classdict[anno.name] = metacls._createAnnotationWrapper(
                        anno.name, cached_annotations)
            classdict["_cached_annotations"] = cached_annotations
            for prop in wrapped_properties:
                classdict[prop] = metacls._wrapAnnotationProperty(prop)
            # automatically wrap all public class constants
            for name in dir(wraps):
                if name.isupper() and not name[0] == "_":
                    classdict[name] = getattr(wraps, name)
        return super().__new__(metacls, cls, bases, classdict, wraps=wraps)

    @staticmethod
    def _createAnnotationWrapper(anno_name, cached_annotations):
        """
        Create a wrapper for the specified sequence annotation.

        :param anno_name: The name of the annotation to wrap.
        :type anno_name: str

        :param cached_annotations: Names of annotations that should be wrapped
            as a `cached_property` and should return a `TupleWithRange` (which
            caches the min and max values).
        :type cached_annotations: tuple(str)

        :return: The wrapper property.
        :rtype: property
        """
        if anno_name in cached_annotations:

            def annotationWrapper(self):
                anno_vals = []
                for chain in self.sequence._seqs:
                    anno_vals.extend(getattr(chain.annotations, anno_name))
                return TupleWithRange(anno_vals)

            property_type = cached_property
        else:

            def annotationWrapper(self):
                anno_vals = []
                for chain in self.sequence._seqs:
                    anno_vals.extend(getattr(chain.annotations, anno_name))
                return anno_vals

            property_type = property
        annotationWrapper.__name__ = anno_name
        annotationWrapper = property_type(annotationWrapper)
        return annotationWrapper

    @staticmethod
    def _wrapAnnotationProperty(name):
        """
        Wrap the specified property.

        :param name: The name of the property to wrap.
        :type name: str

        :return: The wrapped property
        :rtype: property
        """

        def getter(self):
            return getattr(self.sequence.chains[0].annotations, name)

        def setter(self, value):
            for chain in self.sequence.chains:
                setattr(chain.annotations, name, value)

        def deleter(self):
            for chain in self.sequence.chains:
                delattr(chain.annotations, name)

        return property(getter, setter, deleter)


[docs]class CombinedChainProteinSequenceAnnotations(
        AbstractProteinSequenceAnnotationsMixin,
        AbstractSequenceAnnotations,
        metaclass=CombinedChainSequenceAnnotationMeta,
        wraps=ProteinSequenceAnnotations,
        cached_annotations=("sasa", "window_hydrophobicity",
                            "window_isoelectric_point", "domains"),
        wrapped_properties=("hydrophobicity_window_padding",
                            "isoelectric_point_window_padding", "domains")):
    """
    Sequence annotations for a `sequence.CombinedChainProteinSequence`.
    Annotations will be fetched from the `ProteinSequenceAnnotations` objects
    for each split-chain sequence.
    """
    sequence = util.WeakRefAttribute()

[docs]    def __init__(self, seq):
        """
        :param seq: The sequence to store annotations for.
        :type seq: sequence.CombinedChainProteinSequence
        """
        super().__init__(seq)
        for chain in seq.chains:
            self._connectSignals(chain)

    def _getSignalsAndSlots(self, chain):
        return [
            (chain.annotations.titleChanged, self.titleChanged),
            (chain.annotations.invalidatedLigandContacts,
             self._invalidateLigandContacts),
            (chain.annotations.invalidatedMaxMinBFactor,
             self.invalidateMaxMinBFactor),
            (chain.annotations.invalidatedDomains, self.invalidatedDomains),
            (chain.annotations.annotationInvalidated,
             self._invalidateAnnotation),
        ]

    def _connectSignals(self, chain):
        for signal, slot in self._getSignalsAndSlots(chain):
            signal.connect(slot)

    def _disconnectSignals(self, chain):
        for signal, slot in self._getSignalsAndSlots(chain):
            signal.disconnect(slot)

[docs]    def chainAdded(self, chain):
        """
        Respond to a new chain being added to the sequence.  The sequence is
        responsible for calling this method whenever a chain is added.

        :param chain: The newly added chain.
        :type chain: sequence.ProteinSequence
        """
        self._connectSignals(chain)
        self._invalidateLigandContacts()

[docs]    def chainRemoved(self, chain):
        """
        Respond to a chain being removed from the sequence.  The sequence is
        responsible for calling this method whenever a chain is removed.

        :param chain: The removed chain.
        :type chain: sequence.ProteinSequence
        """
        self._disconnectSignals(chain)
        self._invalidateLigandContacts()

    def _adjustCDR(self, cdr, offset):
        """
        Adjust the start and end residue indices in the given antibody CDR.

        :param cdr: The antibody CDR to adjust.
        :type cdr: AntibodyCDR

        :param offset: The value to offset the indices by.
        :type offset: int

        :return: The adjusted CDR.
        :rtype: AntibodyCDR
        """
        # if the label is NotCDR, both start and end will be None
        if cdr.label is AntibodyCDRLabel.NotCDR or offset == 0:
            return cdr
        else:
            return AntibodyCDR(cdr.label, cdr.start + offset, cdr.end + offset)

[docs]    def getAntibodyCDR(self, col, scheme):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        seq, seq_index, offset = self.sequence.indexToSeqAndIndex(col)
        cdr = seq.annotations.getAntibodyCDR(seq_index, scheme)
        return self._adjustCDR(cdr, offset)

[docs]    def getAntibodyCDRs(self, scheme):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        cdrs = []
        for chain, offset in zip(self.sequence.chains,
                                 self.sequence.chain_offsets):
            for cur_cdr in chain.annotations.getAntibodyCDRs(scheme):
                cdrs.append(self._adjustCDR(cur_cdr, offset))
        return cdrs

[docs]    def isAntibodyChain(self):
        # See AbstractProteinSequenceAnnotationMixin for method documentation
        return any(chain.annotations.isAntibodyChain()
                   for chain in self.sequence.chains)

[docs]    def setLigandDistance(self, distance):
        # See AbstractProteinSequenceAnnotationsMixin for method documentation
        for chain in self.sequence.chains:
            chain.annotations.setLigandDistance(distance)

    def _updateLigandContacts(self):
        # See AbstractProteinSequenceAnnotationsMixin for method documentation
        all_ligs = set(
            itertools.chain.from_iterable(
                chain.annotations.ligands for chain in self.sequence.chains))
        self._ligands = sorted(all_ligs)
        if not all_ligs:
            self._binding_sites = [[] for _ in range(len(self.sequence))]
            return
        lig_data = {lig: [] for lig in all_ligs}
        for chain in self.sequence.chains:
            remaining_ligs = set(all_ligs)
            cur_ann = chain.annotations
            binding_site_per_lig = list(zip(*cur_ann.binding_sites))
            for lig, binding_site in zip(cur_ann.ligands, binding_site_per_lig):
                remaining_ligs.remove(lig)
                lig_data[lig].extend(binding_site)
            for lig in remaining_ligs:
                lig_data[lig].extend([BINDING_SITE.NoContact] * len(chain))

        binding_sites = [lig_data[lig] for lig in self._ligands]
        self._binding_sites = list(zip(*binding_sites))

    def _invalidateAnnotation(self, ann):
        """
        Invalidate any caching for the specified annotation.

        :param ann: The annotation to invalidate caching for.
        :type ann: ProteinSequenceAnnotations.ANNOTATION_TYPES
        """
        try:
            delattr(self, ann.name)
        except AttributeError:
            pass

[docs]    def clearAllCaching(self):
        super().clearAllCaching()
        # _cached_annotations is populated in the metaclass based on the
        # cached_annotations argument on the class declaration line
        for ann in self._cached_annotations:
            self._invalidateAnnotation(ann)


[docs]def make_ligand_name_atom(ct, atom_index):
    """
    Make a unique, human-readable name for a ligand identified by atom index.

    :param ct: Structure the ligand belongs to
    :type ct: schrodinger.structure.Structure

    :param atom_index: the atom index of the ligand to make a name for
    :type atom_index: int

    :return: The name for the ligand
    :rtype: str
    """
    lig_atom = ct.atom[atom_index]
    return f'{lig_atom.chain}: {lig_atom.pdbres.strip()} {lig_atom.resnum:d}'


[docs]def make_ligand_name(ct, ligand):
    """
    Make a unique, human-readable name for a ligand. This name matches the
    ligand name in the structure hierarchy.

    :param ct: Structure the ligand belongs to
    :type ct: schrodinger.structure.Structure

    :param ligand: the ligand to make a name for
    :type ligand: schrodinger.structutils.analyze.Ligand

    :return: The name for the ligand
    :rtype: str
    """
    return make_ligand_name_atom(ct, ligand.atom_indexes[0])


[docs]def parse_antibody_rescode(newcode):
    """
    Extract the resnum and inscode from residue number as per the scheme.
    If the inscode is a number it will be converted to alphabet.
    eg: 'H101.1' -> '101A'.
    Residues that are outside of the numbering scheme catalog (FV) or can
    not be assigned properly, will have residue number as '-1'. eg: 'H-1'

    :param newcode: Residue code by the Antibody CDR numbering scheme.
    :type newcode: str

    :return: new residue number and insertion code.
    :rtype: tuple

    :raises KeyError: if newcode doesn't follow the expected pattern.

    """
    pattern = r'[A-Z_](-?\d+)\.?([A-Za-z]?[0-9]*)?'
    match = re.fullmatch(pattern, newcode)
    if match:
        res_num, inscode = match.groups()
        res_num = int(res_num)
        if inscode:
            # Convert number to corresponding letter: eg: 1->A, 2->B. If the
            # number > 27 inscode will be None.
            if inscode.isnumeric():
                if int(inscode) < 27:
                    inscode = chr(ord('@') + int(inscode))
                else:
                    inscode = None
                    res_num = None
        else:
            inscode = ' '
    else:
        raise KeyError(f'Unknown residue code "{newcode}"')

    return res_num, inscode