Source code for schrodinger.protein.annotation

# -*- coding: utf-8 -*-
"""
Annotations for biological sequences

Copyright Schrodinger, LLC. All rights reserved.
"""

import collections
import enum
import itertools
import math
import re
import warnings
from enum import Enum
from functools import cached_property

from Bio import SeqIO

from schrodinger.application.msv import utils as msv_utils
from schrodinger.application.msv.utils import get_rolling_average
from schrodinger.infra import util
from schrodinger.models import jsonable
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze

DEFAULT_WINDOW_PADDING = 4

_MODULE_NOT_YET_IMPORTED = object()
antibody = _MODULE_NOT_YET_IMPORTED
_SeqType = None
_disulfide_bond_title = "S-S bond"
_ssa_title = "SSA"


def _delayed_antibody_import():
    """
    The antibody modules imports alignment, which imports this module.  To avoid
    a circular import, we delay importing antibody until it's needed.  (See
    `_AntibodyCDRFinder.__init__`.)
    """
    global antibody
    try:
        from schrodinger.application.prime.packages import antibody
    except ImportError:
        # Prime is not installed.  We can still create _AntibodyCDRFinder
        # instances, but they won't find any CDRs.
        antibody = None
    else:
        global _SeqType

        class _SeqType(SeqTypeMixin, antibody.SeqType):
            pass


LOGO_MAX_DIVERSITY = math.log(20.0, 2.0)


class _BindingSiteDistance(jsonable.JsonableEnum):
    d3 = 3
    d4 = 4
    d5 = 5
    d6 = 6
    d7 = 7
    d8 = 8


BindingSiteDistance = util.enum_speedup(_BindingSiteDistance)
BINDING_SITE = Enum('BINDING_SITE', ['CloseContact', 'FarContact', 'NoContact'])

AntibodyCDRLabel = Enum('AntibodyCDRLabel',
                        ['NotCDR', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3'])
AntibodyCDR = collections.namedtuple('AntibodyCDR', 'label start end')
Domains = Enum('Domains', ['Domain', 'NoDomain'])

AntibodyCDRScheme = util.enum_speedup(
    jsonable.JsonableEnum(
        'AntibodyCDRScheme',
        ['Chothia', 'Kabat', 'IMGT', 'EnhancedChothia', 'AHo']))

DEFAULT_ANTIBODY_SCHEME = AntibodyCDRScheme.Kabat
ANTIBODY_CDR_ITEMS = {
        "Chothia": AntibodyCDRScheme.Chothia,
        "Enhanced Chothia": AntibodyCDRScheme.EnhancedChothia,
        "Kabat": AntibodyCDRScheme.Kabat,
        "IMGT": AntibodyCDRScheme.IMGT,
        "AHo": AntibodyCDRScheme.AHo
    } # yapf: disable


[docs]@enum.unique class KinaseConservation(jsonable.JsonableEnum): VeryLow = "Very Low" Low = "Low" Medium = "Medium" High = "High" VeryHigh = "Very High"
[docs]@enum.unique class KinaseFeatureLabel(jsonable.JsonableEnum): GLYCINE_RICH_LOOP = "Glycine Rich Loop" ALPHA_C = "Alpha-C" GATE_KEEPER = "Gate Keeper" HINGE = "Hinge" LINKER = "Linker" HRD = "HRD" CATALYTIC_LOOP = "Catalytic Loop" DFG = "DFG" ACTIVATION_LOOP = "Activation Loop" NO_FEATURE = "No Feature"
[docs]class Consensus(Enum): not_conserved = " " fully_conserved = "*" strongly_conserved = ":" weakly_conserved = "." @property def tooltip(self): return self.name.replace("_", " ")
class _AnnotationEnum(jsonable.JsonableEnum): """ An enum class with a title for use in row headers. The title is based on the enum name by default but can be overridden if desired. """ def __init__(self, _=None): # if an Enum subclass defines an init the value of the enum will be # passed to it super(_AnnotationEnum, self).__init__() self._title = None self._tooltip = '' @property def title(self): if self._title is not None: # could be a custom empty string return self._title else: return self.name.capitalize().replace("_", " ") @title.setter def title(self, value): self._title = value @property def tooltip(self): # Default is no tooltip. May override with custom tooltip. return self._tooltip @tooltip.setter def tooltip(self, value): self._tooltip = value class _SequenceAnnotationEnum(_AnnotationEnum): """ An enum class for annotations that apply to a single sequence (as opposed to global annotations that apply to the alignment as a whole). :ivar requires_structure: Does this annotation only apply to sequences that have an associated structure? :vartype requires_structure: bool :ivar multi_value: Can this annotation have multiple values? :vartype multi_value: bool :ivar can_expand: Can this annotation be used to expand selection? Should only be set to True for annotations with categorical values (e.g. enum, string, or limited int values). Annotations may need special handling in `getAnnotationValueForComparison` in sequence.py :vartype can_expand: bool """ def __init__(self, _): super(_SequenceAnnotationEnum, self).__init__() self.requires_structure = False self.multi_value = False self.can_expand = False
[docs]class TupleWithRange(tuple): @cached_property def range(self): """ The range of data contianed in this tuple. Will return a tuple of (minimum value or zero whichever is less, maximum value or zero whichever is greater). `None` values will be ignored. If there are no `None` values in this tuple, will return (0, 0). :rtype: tuple(int or float, int or float) """ values = [val for val in self if val is not None] values.append(0) return min(values), max(values)
[docs]class AbstractSequenceAnnotations(QtCore.QObject): """ A base class for single-chain and combined-chain sequence annotations :ivar titleChanged: A signal emitted after an annotation's title (row header) changes. :vartype titleChanged: `QtCore.pyqtSignal` """ titleChanged = QtCore.pyqtSignal() # avoid circular refs between sequences and annotations sequence = util.WeakRefAttribute()
[docs] def __init__(self, seq): """ :param seq: The sequence to store annotations for. :type seq: sequence.Sequence """ super().__init__() self.sequence = seq
[docs]class AbstractProteinSequenceAnnotationsMixin: domainsChanged = QtCore.pyqtSignal() invalidatedDomains = QtCore.pyqtSignal()
[docs] def __init__(self, *args, **kwargs): """ :param seq: The sequence to store annotations for. :type seq: sequence.Sequence """ super().__init__(*args, **kwargs) self._binding_sites = None self._ligands = None self._ligand_asls = None self._domains = None self._domains_parsed = False
@cached_property def max_b_factor(self): values = [val for val in self.b_factor if val is not None] return max(values) if len(values) > 0 else 0.0 @cached_property def min_b_factor(self): values = [val for val in self.b_factor if val is not None] return min(values) if len(values) > 0 else 0.0
[docs] @QtCore.pyqtSlot() def invalidateMaxMinBFactor(self): try: del self.max_b_factor except AttributeError: pass try: del self.min_b_factor except AttributeError: pass
[docs] def getAntibodyCDR(self, col, scheme): """ Returns the antibody CDR information of the col'th index in the sequence under a given antibody CDR numbering scheme. :param col: index into the sequence :type col: int :param scheme: The antibody CDR numbering scheme to use :type scheme: `AntibodyCDRScheme` :returns: Antibody CDR label, start, and end positions :rtype: `AntibodyCDR`, which is a named tuple of (`AntibodyCDRLabel`, int, int) if col is in a CDR, otherwise (AntibodyCDRLabel.NotCDR, None, None) """ raise NotImplementedError
[docs] def getAntibodyCDRs(self, scheme): """ Returns a list of antibody CDR information for the entire sequence. :param scheme: The antibody CDR numbering scheme to use :type scheme: AntibodyCDRScheme :returns: A list of Antibody CDR labels, starts, and end positions :rtype: list(AntibodyCDR) """ raise NotImplementedError
[docs] def isAntibodyChain(self): """ :returns: Whether the sequence described is an antibody chain :rtype: bool """ raise NotImplementedError
[docs] def isAntibodyHeavyChain(self): """ :returns: Whether the sequence described is an antibody heavy chain :rtype: bool """ raise NotImplementedError
[docs] def isAntibodyLightChain(self): """ :returns: Whether the sequence described is an antibody light chain :rtype: bool """ raise NotImplementedError
@property def binding_sites(self): if self._binding_sites is None: self._updateLigandContacts() return self._binding_sites @property def ligands(self): if self._ligands is None: self._updateLigandContacts() return self._ligands @property def ligand_asls(self): if self._ligand_asls is None: self._updateLigandContacts() return self._ligand_asls def _updateLigandContacts(self): """ Update the ligand contacts cache based on the sequence's structure """ raise NotImplementedError @QtCore.pyqtSlot() def _invalidateLigandContacts(self): """ Invalidates the cached ligand contact data. """ self._binding_sites = None self._ligands = None
[docs] def setLigandDistance(self, distance): """ Updates the ligand distance and invalidates the cache """ raise NotImplementedError
@property def domains(self): if self._domains is None and self._domains_parsed: seen = set() domains = [] for res in self.sequence.residues(): if res.domains is None: continue for domain_name in res.domains: if domain_name in seen: continue seen.add(domain_name) domains.append(domain_name) self._domains = domains return self._domains
[docs] def getSSBondPartner(self, index): """ Return the residue's intra-sequence disulfide bond partner, if any. If the residue is not involved in a disulfide bond, its partner has been deleted, or its partner is in another sequence, it will return None. :param index: Index of the residue to check :type index: int :return: the other Residue in the disulfide bond or None :rtype: schrodinger.protein.residue.Residue or None """ current_res = self.sequence[index] cur_ss_bond = current_res.disulfide_bond if cur_ss_bond is None or not cur_ss_bond.is_intra_sequence: return None res_pair = cur_ss_bond.res_pair for res in res_pair: if res != current_res: return res # Res is deleted return None
[docs] def clearAllCaching(self): self._invalidateLigandContacts() self.invalidateMaxMinBFactor()
[docs] def getNumAnnValues(self, ann): if not ann.multi_value: return 1 if ann in (self.ANNOTATION_TYPES.binding_sites, self.ANNOTATION_TYPES.kinase_conservation): return len(self.ligands) elif ann is self.ANNOTATION_TYPES.domains: return len(self.domains) if self.domains else 0 else: raise ValueError(f"The multi-row annotation {ann} lacks a " "definition for how many rows it takes up.")
[docs]class SequenceAnnotations(AbstractSequenceAnnotations): """ Knows how to annotate a single-chain sequence Annotations can be set at the level of the sequence as a whole, or be per sequence element annotations. If an attribute is accessed on the SequenceAnnotations object, the attribute is first looked for on the object and if not found is assumed to be a per sequence element annotation. If the elements in the sequence lack the attribute, an AttributeError will be raised. """ def _getSequenceLengthAnnotation(self, annotation_name): annotation = [ None if elem.is_gap else getattr(elem, annotation_name) for elem in self.sequence ] return annotation def _setSequenceLengthAnnotation(self, annotation_name, values): self._lengthCheck(values) for el, val in zip(self.sequence, values): if val is not None: setattr(el, annotation_name, val) def _lengthCheck(self, values): if len(values) != len(self.sequence): raise ValueError("Values supplied don't match the sequence length") def __getattr__(self, attr): """ Returns a list of annotations, one for each residue in the sequence. """ return self._getSequenceLengthAnnotation(attr)
[docs]class ProteinSequenceAnnotations(AbstractProteinSequenceAnnotationsMixin, SequenceAnnotations): """ Knows how to annotate a ProteinSequence """ annotationInvalidated = QtCore.pyqtSignal(object) invalidatedLigandContacts = QtCore.pyqtSignal() invalidatedMaxMinBFactor = QtCore.pyqtSignal() @enum.unique class ANNOTATION_TYPES(_SequenceAnnotationEnum): # the enum values must not be changed for backwards compatibility. # declaration order determines the order in which the annotations are # displayed. pairwise_constraints = 1 alignment_set = 2 resnum = 3 rescode = 4 disulfide_bonds = 5 helix_propensity = 6 beta_strand_propensity = 7 turn_propensity = 8 helix_termination_tendency = 9 exposure_tendency = 10 steric_group = 11 side_chain_chem = 12 hydrophobicity = 13 isoelectric_point = 14 b_factor = 15 window_hydrophobicity = 16 window_isoelectric_point = 17 secondary_structure = 18 domains = 20 antibody_cdr = 21 sasa = 22 pfam = 23 pred_disulfide_bonds = 24 pred_secondary_structure = 25 pred_accessibility = 26 pred_disordered = 27 pred_domain_arr = 28 proximity_constraints = 29 binding_sites = 19 kinase_conservation = 31 kinase_features = 30 gpcr_segment = 32 gpcr_generic_number = 33 ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES) RES_PROPENSITY_ANNOTATIONS = { ANNOTATION_TYPES.beta_strand_propensity, ANNOTATION_TYPES.exposure_tendency, ANNOTATION_TYPES.helix_propensity, ANNOTATION_TYPES.helix_termination_tendency, ANNOTATION_TYPES.side_chain_chem, ANNOTATION_TYPES.steric_group, ANNOTATION_TYPES.turn_propensity } PRED_ANNOTATION_TYPES = { ANNOTATION_TYPES.pred_disulfide_bonds, ANNOTATION_TYPES.pred_secondary_structure, ANNOTATION_TYPES.pred_accessibility, ANNOTATION_TYPES.pred_disordered, ANNOTATION_TYPES.pred_domain_arr, } ANNOTATION_TYPES.resnum.can_expand = True ANNOTATION_TYPES.rescode.can_expand = True ANNOTATION_TYPES.helix_propensity.can_expand = True ANNOTATION_TYPES.beta_strand_propensity.can_expand = True ANNOTATION_TYPES.turn_propensity.can_expand = True ANNOTATION_TYPES.helix_termination_tendency.can_expand = True ANNOTATION_TYPES.exposure_tendency.can_expand = True ANNOTATION_TYPES.steric_group.can_expand = True ANNOTATION_TYPES.side_chain_chem.can_expand = True ANNOTATION_TYPES.secondary_structure.can_expand = True ANNOTATION_TYPES.domains.can_expand = True ANNOTATION_TYPES.antibody_cdr.can_expand = True ANNOTATION_TYPES.pfam.can_expand = True ANNOTATION_TYPES.pred_secondary_structure.can_expand = True ANNOTATION_TYPES.pred_accessibility.can_expand = True ANNOTATION_TYPES.pred_disordered.can_expand = True ANNOTATION_TYPES.pred_domain_arr.can_expand = True ANNOTATION_TYPES.binding_sites.can_expand = True # ANNOTATION_TYPES.kinase_conservation.can_expand = True # TODO MSV-3355 ANNOTATION_TYPES.kinase_features.can_expand = True ANNOTATION_TYPES.resnum.title = "Residue number" ANNOTATION_TYPES.resnum.tooltip = "Residue Numbers" ANNOTATION_TYPES.rescode.title = "Residue code" ANNOTATION_TYPES.disulfide_bonds.title = _disulfide_bond_title ANNOTATION_TYPES.disulfide_bonds.tooltip = 'Disulfide Bonds' ANNOTATION_TYPES.secondary_structure.title = _ssa_title ANNOTATION_TYPES.secondary_structure.tooltip = "Secondary Structure Assignment" ANNOTATION_TYPES.window_hydrophobicity.title = 'Hydrophobicity' ANNOTATION_TYPES.window_isoelectric_point.title = 'Isoelectric Point' ANNOTATION_TYPES.b_factor.requires_structure = True ANNOTATION_TYPES.b_factor.tooltip = 'B Factor' ANNOTATION_TYPES.binding_sites.requires_structure = True ANNOTATION_TYPES.binding_sites.multi_value = True ANNOTATION_TYPES.binding_sites.tooltip = "Binding Site (Ligand Contacts)" ANNOTATION_TYPES.kinase_conservation.title = "Kinase Binding Site Conservation" ANNOTATION_TYPES.kinase_conservation.requires_structure = True ANNOTATION_TYPES.kinase_conservation.multi_value = True ANNOTATION_TYPES.kinase_features.title = "Kinase Features" ANNOTATION_TYPES.domains.tooltip = "Domains" ANNOTATION_TYPES.domains.multi_value = True ANNOTATION_TYPES.antibody_cdr.title = "Antibody CDRs" ANNOTATION_TYPES.pairwise_constraints.title = "Constraints" ANNOTATION_TYPES.proximity_constraints.title = "Constraints" ANNOTATION_TYPES.pred_disulfide_bonds.title = f"{_disulfide_bond_title} (P)" ANNOTATION_TYPES.pred_disulfide_bonds.tooltip = "Disulfide Bonds Prediction" ANNOTATION_TYPES.pred_disordered.title = "Disordered (P)" ANNOTATION_TYPES.pred_disordered.tooltip = "Disordered Prediction" ANNOTATION_TYPES.pred_secondary_structure.title = f"{_ssa_title} (P)" ANNOTATION_TYPES.pred_secondary_structure.tooltip = "Secondary Structure Assignment Prediction" ANNOTATION_TYPES.pred_domain_arr.title = "Domains (P)" ANNOTATION_TYPES.pred_domain_arr.tooltip = "Domains Prediction" ANNOTATION_TYPES.pred_accessibility.title = "Accessibility (P)" ANNOTATION_TYPES.pred_accessibility.tooltip = "Accessibility Prediction" ANNOTATION_TYPES.gpcr_segment.title = "GPCR Segment" ANNOTATION_TYPES.gpcr_segment.tooltip = "GPCR Segment" ANNOTATION_TYPES.gpcr_generic_number.title = "GPCR Generic Number" ANNOTATION_TYPES.gpcr_generic_number.tooltip = "GPCR Generic Number"
[docs] def __init__(self, seq): super().__init__(seq) self.from_maestro = False # If seq has corresponding entry in Maestro. self.maestro_entry_id = None self.maestro_chain_name = None self.chain_id = None self._binding_site_residues = collections.defaultdict(set) self._pred_disulfide_bonds = None self._ligand_distance = 5 self._cdr_finder = None self._hydrophobicity_window_padding = DEFAULT_WINDOW_PADDING self._isoelectric_point_window_padding = DEFAULT_WINDOW_PADDING residue_changed_slots = (self._invalidateLigandContacts, self._renumberAntibodyCDRSlot, self._invalidateSASA, self.invalidateWindowHydrophobicity, self.invalidateWindowIsoelectricPoint, self.invalidateMaxMinBFactor, self._invalidateDomains) for slot in residue_changed_slots: self.sequence.residuesChanged.connect(slot) self.sequence.residuesAdded.connect(self._invalidateAntibodyCDRSlot) self.sequence.residuesRemoved.connect(self._invalidateAntibodyCDRSlot)
[docs] @QtCore.pyqtSlot() def invalidateMaxMinBFactor(self): super().invalidateMaxMinBFactor() self.invalidatedMaxMinBFactor.emit()
@cached_property def window_hydrophobicity(self): return self._getWindowAvgAnnotation("hydrophobicity", self.hydrophobicity_window_padding) @property def hydrophobicity_window_padding(self): return self._hydrophobicity_window_padding @hydrophobicity_window_padding.setter def hydrophobicity_window_padding(self, value): self._hydrophobicity_window_padding = value self.invalidateWindowHydrophobicity() @property def isoelectric_point_window_padding(self): return self._isoelectric_point_window_padding @property def binding_site_residues(self): """ Binding site residues of the sequence as a map, with key being the ligand name(str) and value is the set of residues(protein.Residue). """ if not self._binding_site_residues: self._updateLigandContacts() return self._binding_site_residues @isoelectric_point_window_padding.setter def isoelectric_point_window_padding(self, value): self._isoelectric_point_window_padding = value self.invalidateWindowIsoelectricPoint()
[docs] @QtCore.pyqtSlot() def invalidateWindowHydrophobicity(self): """ Invalidate the cached window hydrophobicity data. Note that this method is also called from the sequence when the window size changes. """ try: del self.window_hydrophobicity except AttributeError: pass self.annotationInvalidated.emit( self.ANNOTATION_TYPES.window_hydrophobicity)
@cached_property def window_isoelectric_point(self): return self._getWindowAvgAnnotation( "isoelectric_point", self.isoelectric_point_window_padding)
[docs] @QtCore.pyqtSlot() def invalidateWindowIsoelectricPoint(self): """ Invalidate the cached window isoelectric point data. Note that this method is also called from the sequence when the window size changes. """ try: del self.window_isoelectric_point except AttributeError: pass self.annotationInvalidated.emit( self.ANNOTATION_TYPES.window_isoelectric_point)
@cached_property def sasa(self): try: finder = _SASAFinder(self.sequence) except ValueError: vals = [None] * len(self.sequence) else: vals = finder.getSASAs() return TupleWithRange(vals) @QtCore.pyqtSlot() def _invalidateSASA(self): """ Invalidates the cached SASA data. The connected signal emits (int, int) and the slot decorator must match the signal, but the method does not need to have the params. """ try: del self.sasa except AttributeError: pass self.annotationInvalidated.emit(self.ANNOTATION_TYPES.sasa) def _getWindowAvgAnnotation(self, annotation_name, window_padding): """ Calculate values for the specified window averaged annotation for the entire sequence. :param annotation_name: name of annotation to average in a window :type annotation_name: str :param window_padding: number of values to average over, on both the left and the right; window_padding=4 gives window size of 9 :type window_padding: int :return: list containing sliding window averages of annotation values specified by annotation_name :rtype: list(float or None) """ anno_vals = [ getattr(res, annotation_name) if res.is_res else None for res in self.sequence ] if window_padding == 0: return TupleWithRange(anno_vals) anno_vals_wo_gaps = [val for val in anno_vals if val is not None] avgs_wo_gaps = get_rolling_average(anno_vals_wo_gaps, window_padding) if not avgs_wo_gaps: # The sequence is smaller than the window size, so we can't # calculate any average values return TupleWithRange([None] * len(anno_vals)) # Figure out the index of the first residue that has a fully populated # window (i.e. the first residue with `window_padding` non-gap residues # before it) res_count = 0 start_index = 0 for i, val in enumerate(anno_vals): if val is not None: res_count += 1 if res_count == window_padding: start_index = i + 1 break # copy values from avgs_wo_gaps into avgs_with_gaps, taking gap # locations into account avgs_index = 0 avgs_with_gaps = [None] * len(anno_vals) for i, val in enumerate(anno_vals[start_index:], start=start_index): if val is not None: avgs_with_gaps[i] = avgs_wo_gaps[avgs_index] avgs_index += 1 if avgs_index == len(avgs_wo_gaps): # We've reached the last residue that has a fully poulated # window. There are exactly `window_padding` non-gap # residues afer this one, which we can confirm by # uncommenting the assertion below. (This should normally be # left commented out for performance reasons.) # assert sum(1 for val in anno_vals[i + 1:] if val # is not None) == window_padding break return TupleWithRange(avgs_with_gaps)
[docs] def getAntibodyCDR(self, col, scheme): # See AbstractProteinSequenceAnnotationMixin for method documentation for cdr in self.getAntibodyCDRs(scheme): if cdr.start <= col <= cdr.end: return cdr return AntibodyCDR(label=AntibodyCDRLabel.NotCDR, start=None, end=None)
[docs] def getAntibodyCDRs(self, scheme): # See AbstractProteinSequenceAnnotationMixin for method documentation if not self.sequence.getGaplessLength(): return [] if self._cdr_finder is None: self._cdr_finder = _AntibodyCDRFinder(self.sequence) return self._cdr_finder.getCDRs(scheme)
[docs] def isAntibodyChain(self): # See AbstractProteinSequenceAnnotationMixin for method documentation if not self.sequence.getGaplessLength(): return [] if self._cdr_finder is None: self._cdr_finder = _AntibodyCDRFinder(self.sequence) return self._cdr_finder.isAntibodyChain()
[docs] def isAntibodyHeavyChain(self): # See AbstractProteinSequenceAnnotationMixin for method documentation if not self.isAntibodyChain(): return False return self._cdr_finder.isAntibodyHeavyChain()
[docs] def isAntibodyLightChain(self): # See AbstractProteinSequenceAnnotationMixin for method documentation if not self.isAntibodyChain(): return False return self._cdr_finder.isAntibodyLightChain()
@QtCore.pyqtSlot() def _invalidateAntibodyCDRSlot(self): """ Invalidates the cached antibody CDR data. """ self._invalidateAntibodyCDRs(recalculate=True) @QtCore.pyqtSlot() def _renumberAntibodyCDRSlot(self): """ Renumbers the cached antibody CDR data. """ self._invalidateAntibodyCDRs(recalculate=False) def _invalidateAntibodyCDRs(self, recalculate=True): """ Invalidate the Antibody CDR cache. If we've only added gaps to the sequence, we only need to re-assign the CDR indexes, but if we add or insert residues, we need to recalculate CDRs entirely. :param recalculate: whether to recalculate CDRs entirely :type recalculate: bool """ if recalculate: self._cdr_finder = None elif self._cdr_finder is not None: self._cdr_finder.forceIndexReassignment()
[docs] def getSparseRescodes(self, modulo): codes = [] for res in self.sequence: if res.is_res and res.resnum and res.resnum % modulo == 0: codes.append(res.rescode) else: codes.append(None) return codes
[docs] def onStructureChanged(self): self._invalidateLigandContacts() self._invalidateAntibodyCDRs(recalculate=True) self._invalidateSASA()
[docs] def setLigandDistance(self, distance): # See AbstractProteinSequenceAnnotationsMixin for method documentation if distance == self._ligand_distance: return self._ligand_distance = distance self._invalidateLigandContacts()
@QtCore.pyqtSlot() def _invalidateLigandContacts(self): # See AbstractProteinSequenceAnnotationsMixin for method documentation super()._invalidateLigandContacts() self._binding_site_residues.clear() self.invalidatedLigandContacts.emit() def _updateLigandContacts(self): # See AbstractProteinSequenceAnnotationsMixin for method documentation try: contacts_finder = _LigandContactsFinder(self.sequence) except ValueError: self._ligands = [] self._ligand_asls = [] self._binding_sites = [[] for _ in self.sequence] return far_lig_dist = self._ligand_distance + 1 close_lig_dist = self._ligand_distance - 1 ligands, lig_asls = contacts_finder.getLigandsWithin(far_lig_dist) binding_sites = [ [BINDING_SITE.NoContact] * len(ligands) for _ in self.sequence ] for i, lig in enumerate(ligands): far_idxs = contacts_finder.resIndexesNear(lig, far_lig_dist) close_idxs = contacts_finder.resIndexesNear(lig, close_lig_dist) for idx in far_idxs: binding_sites[idx][i] = BINDING_SITE.FarContact self._binding_site_residues[lig].add(self.sequence[idx]) for idx in close_idxs: binding_sites[idx][i] = BINDING_SITE.CloseContact self._binding_site_residues[lig].add(self.sequence[idx]) self._ligand_asls = lig_asls self._binding_sites = binding_sites need_signal = (self._ligands != ligands) self._ligands = ligands if need_signal: self.titleChanged.emit() @QtCore.pyqtSlot() def _invalidateDomains(self): """ Clears the domain annotation cache. """ if self._domains is not None: self._domains = None self.invalidatedDomains.emit()
[docs] def parseDomains(self, filename): """ Parse XML file from UniProt database to get domain information. :param filename: the XML file to parse for domain information :type filename: str :return: a list of the domains (names) for the sequence in order :rtype: list(str) """ domains = [] record = SeqIO.read(filename, "uniprot-xml") residues_by_resnum = collections.defaultdict(list) for res in self.sequence.residues(): residues_by_resnum[res.resnum].append(res) for feature in record.features: if feature.type != "domain": continue name = feature.qualifiers.get('description') if name is None: continue location = feature.location start = int(location.start) end = int(location.end) any_res_found = False for domain_resnum in range(start, end + 1): residues = residues_by_resnum.get(domain_resnum) if residues is None: continue any_res_found = True for res in residues: if res.domains is None: res.domains = {name} else: res.domains.add(name) if any_res_found: domains.append(name) self._domains = domains self._domains_parsed = True self.domainsChanged.emit() return domains
[docs] def resetAnnotation(self, ann): "Force a reset of an annotation's cache." if ann is self.ANNOTATION_TYPES.binding_sites: self._invalidateLigandContacts() elif ann is self.ANNOTATION_TYPES.antibody_cdr: self._invalidateAntibodyCDRs() elif ann is self.ANNOTATION_TYPES.window_hydrophobicity: self.invalidateWindowHydrophobicity() elif ann is self.ANNOTATION_TYPES.window_isoelectric_point: self.invalidateWindowIsoelectricPoint() elif ann is self.ANNOTATION_TYPES.sasa: self._invalidateSASA() elif ann is self.ANNOTATION_TYPES.domains: self._invalidateDomains()
[docs] def clearAllCaching(self): for ann in self.ANNOTATION_TYPES: self.resetAnnotation(ann)
@property def inscode(self): return self._getSequenceLengthAnnotation('inscode') @inscode.setter def inscode(self, codes): self._setSequenceLengthAnnotation('inscode', codes) @property def resnum(self): return self._getSequenceLengthAnnotation('resnum') @resnum.setter def resnum(self, nums): self._setSequenceLengthAnnotation('resnum', nums)
[docs] def getCDRResidueList(self, scheme): """ :return: List of CDR Residues. :rtype: List[str] """ if self._cdr_finder is None: self._cdr_finder = _AntibodyCDRFinder(self.sequence) if scheme is not self._cdr_finder.scheme: self._cdr_finder.updateScheme(scheme) return self._cdr_finder.getResidueList()
[docs]class NucleicAcidSequenceAnnotations(ProteinSequenceAnnotations): # For the alpha release, we only have "limited support" of nucleic acids, # so this class is just a stub. # TODO (MSV-1504): Create proper nucleic acid domain objects
[docs] def isAntibodyChain(self): return False
############################################################################### # Global (Alignment) level Annotations ###############################################################################
[docs]class ProteinAlignmentAnnotations(object): """ Knows how to annotate an alignment (a collection of aligned sequences) """ @enum.unique class ANNOTATION_TYPES(_AnnotationEnum): # the enum values must not be changed for backwards compatibility. # declaration order determines the order in which the annotations are # displayed. indices = 1 mean_hydrophobicity = 2 mean_isoelectric_point = 3 consensus_symbols = 4 consensus_seq = 5 consensus_freq = 6 sequence_logo = 7 ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES) ANNOTATION_TYPES.mean_hydrophobicity.tooltip = 'Mean Hydrophobicity' ANNOTATION_TYPES.mean_isoelectric_point.tooltip = 'Mean Isoelectric Point' ANNOTATION_TYPES.consensus_symbols.tooltip = 'Consensus Symbols' ANNOTATION_TYPES.consensus_seq.tooltip = 'Consensus Sequence' ANNOTATION_TYPES.sequence_logo.tooltip = 'Sequence Logo' ANNOTATION_TYPES.indices.title = "" # avoid circular refs between alignments and annotations alignment = util.WeakRefAttribute()
[docs] def __init__(self, aln): """ :param aln: `alignment.Alignment` """ self.alignment = aln
def _alnMeanAnnotation(self, annotation_name): """ returns: Per-column averages of annotation values specified by annotation_name, or 0.0 if columns includes only gaps. """ columns = list(self.alignment.columns(omit_gaps=True, match_type=True)) means = [] for column in columns: values_with_missing = ( getattr(res, annotation_name) for res in column) values = [val for val in values_with_missing if val is not None] divisor = float(len(values)) if values else 1.0 mean = sum(values) / divisor means.append(mean) return TupleWithRange(means) @cached_property def indices(self): """ A numbering of all the column indices in an alignment """ return tuple(range(1, self.alignment.num_columns + 1)) @cached_property def mean_hydrophobicity(self): """ returns: A list of floats representing per-column averages of the hydrophobicity of residues in the alignment """ return self._alnMeanAnnotation('hydrophobicity') @cached_property def mean_isoelectric_point(self): """ returns: A list of floats representing per-column averages of the isoelectric point of residues in the alignment """ return self._alnMeanAnnotation('isoelectric_point') @cached_property def consensus_seq(self): """ Consensus sequence in the alignment. If there is more than one highest freq. residue in the column, save all of them. :return: consensus sequence :rtype: list(list(schrodinger.protein.residue.Residue)) """ seq = [] for most_common in self.alignment.getFrequencies(normalize=False): if most_common: _, max_freq = most_common[0] cons = [res for res, freq in most_common if freq == max_freq] else: cons = [] seq.append(cons) return seq @cached_property def consensus_freq(self): """ Returns the frequency of the consensus residue in each alignment column as a list. Gaps are not used for calculation. :return: consensus residue frequencies :rtype: TupleWithRange(float) """ consensus_only = [] for column in self.alignment.getFrequencies(): if column: freq_percentage = round(float(column[0][1] * 100), 2) consensus_only.append(freq_percentage) else: # gap-only columns consensus_only.append(0.00) return TupleWithRange(consensus_only) @cached_property def consensus_symbols(self): """ Consensus symbols in the alignment based on pre-defined residue sets, same as in ClustalW :return: consensus symbols for each alignment position :type: A list of ConsensusSymbol enums. """ strong_sets = [ set("STA"), set("NEQK"), set("NHQK"), set("NDEQ"), set("QHRK"), set("MILV"), set("MILF"), set("HY"), set("FYW") ] weak_sets = [ set("CSA"), set("ATV"), set("SAG"), set("STNK"), set("STPA"), set("SGND"), set("SNDEQK"), set("NDEQHK"), set("NEQHRK"), set("FVLIM"), set("HFY") ] levels = [] for column in self.alignment.columns(omit_gaps=True, match_type=True): level = Consensus.not_conserved if column: column_res = set(res.short_code for res in column) if len(column_res) == 1: # fully conserved to a single residue level = Consensus.fully_conserved elif any(column_res.issubset(strong) for strong in strong_sets): # conserved to one of the strong sets level = Consensus.strongly_conserved elif any(column_res.issubset(weak) for weak in weak_sets): # conserved to one of the weak sets level = Consensus.weakly_conserved levels.append(level) return levels @cached_property def sequence_logo(self): """ Calculates normalized frequencies of individual amino acids per alignment position, and overall estimate of column composition diversity ('bits'). Bit values are weighted by the number of gaps in the column. Schneider TD, Stephens RM (1990). "Sequence Logos: A New Way to Display Consensus Sequences". Nucleic Acids Res 18 (20): 6097–6100. doi:10.1093/nar/18.20.6097 :return: the list of bits and frequencies (in decreasing order) of the residues in each column of the alignment. :rtype: list(tuple(float, tuple(tuple(str, float)))) """ alignment = self.alignment logo = [] num_seqs = len(alignment.getSeqsMatchingRefType()) for (frequencies, column) in zip(alignment.getFrequencies(), alignment.columns(omit_gaps=True, match_type=True)): if frequencies: # Calculate column uncertainty uncertainty = 0.00 for (_, freq) in frequencies: uncertainty -= freq * math.log(freq, 2.0) # Bits represent amount of information at column, weighted by # number of gaps. bit_weight = len(column) / num_seqs bits = (LOGO_MAX_DIVERSITY - uncertainty) * bit_weight entry = (bits, frequencies) else: # gap-only columns entry = (LOGO_MAX_DIVERSITY, ()) logo.append(entry) return logo
[docs] def clearAllCaching(self): for attr in ("indices", "mean_hydrophobicity", "mean_isoelectric_point", "consensus_seq", "consensus_freq", "consensus_symbols", "sequence_logo"): try: delattr(self, attr) except AttributeError: pass
class _LigandContactsFinder(object): """ An class that finds the ligands that are close to a sequence, as well as the residues that are close to those ligands. Note that this only makes sense for sequences that have structures. """ seq = util.WeakRefAttribute() def __init__(self, seq): """ :param seq: The sequence to find the ligand contacts of :type seq: `schrodinger.protein.sequence.Sequence` :raises ValueError: if the sequence has no associated structure """ ct = seq.getStructure() if ct is None: raise ValueError("Ligand contacts can only be found for " "a sequence with a structure.") self.seq = seq self.ct = ct self._all_ligands = analyze.find_ligands(self.ct) self._lig_name_dict = { self._makeLigandName(lig): lig for lig in self._all_ligands } self._res_dict = {(res.resnum, res.inscode): i for (i, res) in enumerate(seq) if res.is_res} def _makeLigandName(self, lig): """ :param lig: the ligand to make a name for :type lig: schrodinger.structutils.analyze.Ligand :return: the name for the ligand :rtype: string """ return make_ligand_name(self.ct, lig) def getLigandsWithin(self, dist): """ Find all ligands that are within a certain distance of a sequence. :param dist: The distance around the sequence to search for ligands :type dist: int :return: a tuple of ligand names and a tuple of ligand ASLs :rtype: tuple[tuple[str], tuple[str]] """ ligand_names_and_asls = [] for lig in self._all_ligands: query = ("(within {dist} ({ligand_asl})) " "and (protein) " "and (chain. {chain_id})").format( ligand_asl=lig.ligand_asl, dist=dist, chain_id=self.seq.structure_chain) if analyze.evaluate_asl(self.ct, query): # lig.ligand_asl can be "too compact" (e.g. `m.n 2`) so # construct a more human-readable ASL atom = self.ct.atom[lig.atom_indexes[0]] lig_asl = f"(res. {atom.resnum} and res.ptype {atom.pdbres})" ligand_names_and_asls.append( (self._makeLigandName(lig), lig_asl)) if not ligand_names_and_asls: return (), () ligand_names_and_asls.sort() # Split name-ASL tuples into names and ASLs lig_names, lig_asls = zip(*ligand_names_and_asls) return lig_names, lig_asls def resIndexesNear(self, lig_name, dist): """ Find all residues that are within a certain distance of a ligand. :param lig_name: the name of the ligand to search for nearby residues :type lig_name: string :param dist: the distance around the ligand to search for residues :type dist: int :return: indexes of the residues in the sequence that are close to the ligand :rtype: set(int) """ try: lig = self._lig_name_dict[lig_name] except KeyError: raise KeyError("Ligand {lig_name} not found on structure.".format( lig_name=lig_name)) # Find atom indexes in structure within `dist` angstroms of the ligand # which are on the sequence's chain query = ("(within {dist} ({ligand_asl})) " "and (protein) " "and (chain. {chain_id})").format( ligand_asl=lig.ligand_asl, dist=dist, chain_id=self.seq.structure_chain) nearby_atom_indexes = analyze.evaluate_asl(self.ct, query) # Get residue indexes in the sequence corresponding to those atoms res_indexes = set() for atom_index in nearby_atom_indexes: atom = self.ct.atom[atom_index] res_key = (atom.resnum, atom.inscode) # FIXME: (MSV-1371) # Remove this check once residue insertion/deletion is # synchronized between the MSV and the workspace. if res_key not in self._res_dict: continue res_index = self._res_dict[res_key] res_indexes.add(res_index) return res_indexes class _AntibodyCDRFinder(object): """ Class to help with finding CDRs ("complementary determining regions") in an antibody chain. There are different numbering schemes for identifying where the CDRs span. Converting between the schemes doesn't cost much time, but determining the where the CDRs are in the first place is rather expensive (~0.3s), so we have to be careful about caching things. """ seq = util.WeakRefAttribute() def __init__(self, seq): """ :param seq: The sequence to find the CDRs on :type seq: `schrodinger.protein.sequence.ProteinSequence` """ if antibody is _MODULE_NOT_YET_IMPORTED: # in-function import to avoid circular import _delayed_antibody_import() if antibody is None: warnings.warn('Prime not installed; antibody CDRs cannot be found!') # define dummy attributes so that we can still access this object # normally, but it won't report any CDRs self.getCDRs = lambda scheme: [] self.forceIndexReassignment = lambda: None self.isAntibodyChain = lambda: False return # normal instantiation when Prime and Clustal are present self.seq = seq self.scheme = AntibodyCDRScheme.Chothia self.seq_type = _SeqType(self.seq, self.scheme.name) self._cdrs = None def getCDRs(self, scheme): """ Returns the antibody CDRs present on the sequence, numbered by the given numbering scheme. :param scheme: The antibody CDR numbering scheme to use :type scheme: `AntibodyCDRScheme` enum :returns: the antibody CDRs on the sequence :rtype: list of `AntibodyCDR` if the sequence is an antibody chain, otherwise an empty list """ if scheme is not self.scheme: self.updateScheme(scheme) if self._cdrs is None: self._cdrs = self._extractCDRs() return self._cdrs def updateScheme(self, scheme): """ Update the numbering scheme by asking the antibody.SeqType "backend". :param scheme: The antibody CDR numbering scheme to use :type scheme: `AntibodyCDRScheme` enum """ self.seq_type.convertScheme(scheme.name) self.scheme = scheme self.forceIndexReassignment() def forceIndexReassignment(self): """ Force a recalculation of the CDR start and end indices. This is required when the numbering scheme changes and when gaps are inserted/removed. """ self._cdrs = None def _extractCDRs(self): """ Extract the CDR information from the antibody.SeqType "backend" that calculated them. Note that the calculated indexes ignore gaps, so we need to translate them back into actual indexes into the sequence. :returns: the antibody CDRs on the sequence :rtype: list of `AntibodyCDR` if the sequence is an antibody chain, otherwise an empty list """ if not self.seq: # SeqType doesn't create attributes for blank sequences, so we'd get # a traceback below return [] index_map = [i for (i, res) in enumerate(self.seq) if res.is_res] cdr_indexes = ((index_map[start], index_map[end]) for start, end in self.seq_type.cdr_index) cdr_labels = self.seq_type.cdr_label return [ AntibodyCDR(label=AntibodyCDRLabel[label], start=start, end=end) for label, (start, end) in zip(cdr_labels, cdr_indexes) ] def isAntibodyChain(self): """ :returns: Whether the sequence is an antibody chain :rtype: bool """ return bool(self.seq_type.cdr_label) def isAntibodyHeavyChain(self): """ :returns: Whether the sequence is an antibody heavy chain :rtype: bool """ return self.seq_type.isHeavyChain() def isAntibodyLightChain(self): """ :returns: Whether the sequence is an antibody light chain :rtype: bool """ return self.seq_type.isLightChain() def getResidueList(self): return self.seq_type.resid_list def _prep_seq_for_prime(seq): """ Convert a ProteinSequence to an uppercase gapless seq str """ seq_str = str(seq).replace(seq.gap_char, "") seq_str = seq_str.upper() return seq_str
[docs]class SeqTypeMixin: """ Mixin to customize antibody.SeqType for MSV2. See _delayed_antibody_import for class declaration. """ _HEAVY_TYPES = {"heavy"} _LIGHT_TYPES = {"light_lambda", "light_kappa"}
[docs] def __init__(self, seq, *args, **kwargs): """ :type seq: schrodinger.protein.sequence.ProteinSequence """ seq_str = _prep_seq_for_prime(seq) super().__init__(seq_str, *args, **kwargs)
[docs] def isHeavyChain(self): return self.type in self._HEAVY_TYPES
[docs] def isLightChain(self): return self.type in self._LIGHT_TYPES
class _SASAFinder(object): """ Class to help with calculating the solvent-accessible surface area on each residue in a sequence. """ seq = util.WeakRefAttribute() def __init__(self, seq): """ :param seq: The sequence to find the residue SASAs of :type seq: schrodinger.protein.sequence.ProteinSequence :raises ValueError: if the sequence doesn't have a structure """ self.seq = seq ct = seq.getStructure() if ct is None: raise ValueError("SASA can only be calculated for sequences " "with structure") self.ct = ct.chain[seq.chain].extractStructure() def getSASAs(self): """ :returns: The SASA of each residue, by index in the sequence. For gaps in the sequence or structureless residues, the SASA is None. :rtype: list of (float or None) """ sasa_by_atom = analyze.calculate_sasa_by_atom(self.ct) # Iterating through residues in the structure may not correspond to # residues in the sequence (because of gaps, solvents, ligands, etc), # so we key each residue's sasa by residue number and insertion code. sasas_by_res = {} for res in self.ct.residue: key = (res.resnum, res.inscode) sasa = sum(sasa_by_atom[a.index - 1] for a in res.atom) sasas_by_res[key] = sasa sasas = [None] * len(self.seq) for i, res in enumerate(self.seq): if res.hasStructure(): key = (res.resnum, res.inscode) sasas[i] = sasas_by_res[key] return sasas
[docs]class CombinedChainSequenceAnnotationMeta(msv_utils.QtDocstringWrapperMetaClass ): """ The metaclass for `CombinedChainSequenceAnnotations`. This metaclass automatically wraps getters for all sequence annotations. """ def __new__(metacls, cls, bases, classdict, *, wraps=None, cached_annotations=(), wrapped_properties=()): """ See Python documentation for positional argument documentation. :param wraps: The `SequenceAnnotations` subclass to wrap. If None, no wrapping will be done. (This allows for subclassing of the wrapper class.) :type wraps: type or None :param cached_annotations: Names of annotations that should be wrapped as a `cached_property` and should return a `TupleWithRange` (which caches the min and max values). This list will be stored in the newly created class as `_cached_annotations`. :type cached_annotations: tuple(str) :param wrapped_properties: Names of properties of `wraps` that should be wrapped. Values will always be fetched from the first chain of the sequence. :type wrapped_properties: tuple(str) """ if wraps is not None: # automatically wrap all annotations unless there's already an # attribute of the same name (such as binding_sites) for anno in wraps.ANNOTATION_TYPES: if (anno.name not in classdict and not any(anno.name in dir(base) for base in bases)): classdict[anno.name] = metacls._createAnnotationWrapper( anno.name, cached_annotations) classdict["_cached_annotations"] = cached_annotations for prop in wrapped_properties: classdict[prop] = metacls._wrapAnnotationProperty(prop) # automatically wrap all public class constants for name in dir(wraps): if name.isupper() and not name[0] == "_": classdict[name] = getattr(wraps, name) return super().__new__(metacls, cls, bases, classdict, wraps=wraps) @staticmethod def _createAnnotationWrapper(anno_name, cached_annotations): """ Create a wrapper for the specified sequence annotation. :param anno_name: The name of the annotation to wrap. :type anno_name: str :param cached_annotations: Names of annotations that should be wrapped as a `cached_property` and should return a `TupleWithRange` (which caches the min and max values). :type cached_annotations: tuple(str) :return: The wrapper property. :rtype: property """ if anno_name in cached_annotations: def annotationWrapper(self): anno_vals = [] for chain in self.sequence._seqs: anno_vals.extend(getattr(chain.annotations, anno_name)) return TupleWithRange(anno_vals) property_type = cached_property else: def annotationWrapper(self): anno_vals = [] for chain in self.sequence._seqs: anno_vals.extend(getattr(chain.annotations, anno_name)) return anno_vals property_type = property annotationWrapper.__name__ = anno_name annotationWrapper = property_type(annotationWrapper) return annotationWrapper @staticmethod def _wrapAnnotationProperty(name): """ Wrap the specified property. :param name: The name of the property to wrap. :type name: str :return: The wrapped property :rtype: property """ def getter(self): return getattr(self.sequence.chains[0].annotations, name) def setter(self, value): for chain in self.sequence.chains: setattr(chain.annotations, name, value) def deleter(self): for chain in self.sequence.chains: delattr(chain.annotations, name) return property(getter, setter, deleter)
[docs]class CombinedChainProteinSequenceAnnotations( AbstractProteinSequenceAnnotationsMixin, AbstractSequenceAnnotations, metaclass=CombinedChainSequenceAnnotationMeta, wraps=ProteinSequenceAnnotations, cached_annotations=("sasa", "window_hydrophobicity", "window_isoelectric_point", "domains"), wrapped_properties=("hydrophobicity_window_padding", "isoelectric_point_window_padding", "domains")): """ Sequence annotations for a `sequence.CombinedChainProteinSequence`. Annotations will be fetched from the `ProteinSequenceAnnotations` objects for each split-chain sequence. """ sequence = util.WeakRefAttribute()
[docs] def __init__(self, seq): """ :param seq: The sequence to store annotations for. :type seq: sequence.CombinedChainProteinSequence """ super().__init__(seq) for chain in seq.chains: self._connectSignals(chain)
def _getSignalsAndSlots(self, chain): return [ (chain.annotations.titleChanged, self.titleChanged), (chain.annotations.invalidatedLigandContacts, self._invalidateLigandContacts), (chain.annotations.invalidatedMaxMinBFactor, self.invalidateMaxMinBFactor), (chain.annotations.invalidatedDomains, self.invalidatedDomains), (chain.annotations.annotationInvalidated, self._invalidateAnnotation), ] def _connectSignals(self, chain): for signal, slot in self._getSignalsAndSlots(chain): signal.connect(slot) def _disconnectSignals(self, chain): for signal, slot in self._getSignalsAndSlots(chain): signal.disconnect(slot)
[docs] def chainAdded(self, chain): """ Respond to a new chain being added to the sequence. The sequence is responsible for calling this method whenever a chain is added. :param chain: The newly added chain. :type chain: sequence.ProteinSequence """ self._connectSignals(chain) self._invalidateLigandContacts()
[docs] def chainRemoved(self, chain): """ Respond to a chain being removed from the sequence. The sequence is responsible for calling this method whenever a chain is removed. :param chain: The removed chain. :type chain: sequence.ProteinSequence """ self._disconnectSignals(chain) self._invalidateLigandContacts()
def _adjustCDR(self, cdr, offset): """ Adjust the start and end residue indices in the given antibody CDR. :param cdr: The antibody CDR to adjust. :type cdr: AntibodyCDR :param offset: The value to offset the indices by. :type offset: int :return: The adjusted CDR. :rtype: AntibodyCDR """ # if the label is NotCDR, both start and end will be None if cdr.label is AntibodyCDRLabel.NotCDR or offset == 0: return cdr else: return AntibodyCDR(cdr.label, cdr.start + offset, cdr.end + offset)
[docs] def getAntibodyCDR(self, col, scheme): # See AbstractProteinSequenceAnnotationMixin for method documentation seq, seq_index, offset = self.sequence.indexToSeqAndIndex(col) cdr = seq.annotations.getAntibodyCDR(seq_index, scheme) return self._adjustCDR(cdr, offset)
[docs] def getAntibodyCDRs(self, scheme): # See AbstractProteinSequenceAnnotationMixin for method documentation cdrs = [] for chain, offset in zip(self.sequence.chains, self.sequence.chain_offsets): for cur_cdr in chain.annotations.getAntibodyCDRs(scheme): cdrs.append(self._adjustCDR(cur_cdr, offset)) return cdrs
[docs] def isAntibodyChain(self): # See AbstractProteinSequenceAnnotationMixin for method documentation return any(chain.annotations.isAntibodyChain() for chain in self.sequence.chains)
[docs] def setLigandDistance(self, distance): # See AbstractProteinSequenceAnnotationsMixin for method documentation for chain in self.sequence.chains: chain.annotations.setLigandDistance(distance)
def _updateLigandContacts(self): # See AbstractProteinSequenceAnnotationsMixin for method documentation all_ligs = set( itertools.chain.from_iterable( chain.annotations.ligands for chain in self.sequence.chains)) self._ligands = sorted(all_ligs) if not all_ligs: self._binding_sites = [[] for _ in range(len(self.sequence))] return lig_data = {lig: [] for lig in all_ligs} for chain in self.sequence.chains: remaining_ligs = set(all_ligs) cur_ann = chain.annotations binding_site_per_lig = list(zip(*cur_ann.binding_sites)) for lig, binding_site in zip(cur_ann.ligands, binding_site_per_lig): remaining_ligs.remove(lig) lig_data[lig].extend(binding_site) for lig in remaining_ligs: lig_data[lig].extend([BINDING_SITE.NoContact] * len(chain)) binding_sites = [lig_data[lig] for lig in self._ligands] self._binding_sites = list(zip(*binding_sites)) def _invalidateAnnotation(self, ann): """ Invalidate any caching for the specified annotation. :param ann: The annotation to invalidate caching for. :type ann: ProteinSequenceAnnotations.ANNOTATION_TYPES """ try: delattr(self, ann.name) except AttributeError: pass
[docs] def clearAllCaching(self): super().clearAllCaching() # _cached_annotations is populated in the metaclass based on the # cached_annotations argument on the class declaration line for ann in self._cached_annotations: self._invalidateAnnotation(ann)
[docs]def make_ligand_name_atom(ct, atom_index): """ Make a unique, human-readable name for a ligand identified by atom index. :param ct: Structure the ligand belongs to :type ct: schrodinger.structure.Structure :param atom_index: the atom index of the ligand to make a name for :type atom_index: int :return: The name for the ligand :rtype: str """ lig_atom = ct.atom[atom_index] return f'{lig_atom.chain}: {lig_atom.pdbres.strip()} {lig_atom.resnum:d}'
[docs]def make_ligand_name(ct, ligand): """ Make a unique, human-readable name for a ligand. This name matches the ligand name in the structure hierarchy. :param ct: Structure the ligand belongs to :type ct: schrodinger.structure.Structure :param ligand: the ligand to make a name for :type ligand: schrodinger.structutils.analyze.Ligand :return: The name for the ligand :rtype: str """ return make_ligand_name_atom(ct, ligand.atom_indexes[0])
[docs]def parse_antibody_rescode(newcode): """ Extract the resnum and inscode from residue number as per the scheme. If the inscode is a number it will be converted to alphabet. eg: 'H101.1' -> '101A'. Residues that are outside of the numbering scheme catalog (FV) or can not be assigned properly, will have residue number as '-1'. eg: 'H-1' :param newcode: Residue code by the Antibody CDR numbering scheme. :type newcode: str :return: new residue number and insertion code. :rtype: tuple :raises KeyError: if newcode doesn't follow the expected pattern. """ pattern = r'[A-Z_](-?\d+)\.?([A-Za-z]?[0-9]*)?' match = re.fullmatch(pattern, newcode) if match: res_num, inscode = match.groups() res_num = int(res_num) if inscode: # Convert number to corresponding letter: eg: 1->A, 2->B. If the # number > 27 inscode will be None. if inscode.isnumeric(): if int(inscode) < 27: inscode = chr(ord('@') + int(inscode)) else: inscode = None res_num = None else: inscode = ' ' else: raise KeyError(f'Unknown residue code "{newcode}"') return res_num, inscode