Source code for schrodinger.protein.analysis

"""
A class for diagnosing and reporting common structural problems of protein
complexes.

Usage:

instance = Report(ct)
instance.write_to_stdout()

Copyright Schrodinger, LLC. All rights reserved.

"""

import future.utils
import math
import re
import sys
from past.utils import old_div

import numpy

import schrodinger.infra.mm as mm
from schrodinger import structure
from schrodinger.infra.mmbitset import Bitset
from schrodinger.protein.analysis_params import backbone_dihedrals
from schrodinger.protein.analysis_params import ideal_bond_angles
from schrodinger.protein.analysis_params import ideal_bond_lengths
from schrodinger.protein.analysis_params import improper_torsions
from schrodinger.protein.analysis_params import planar_groups
from schrodinger.protein.analysis_params import pxp_vdwr
from schrodinger.protein.analysis_params import sidechain_dihedral_atomnames
from schrodinger.protein.analysis_params import sidechain_dihedrals
from schrodinger.structutils import analyze

VDWR_MMAT = 0
VDWR_MMAT_H = 0
VDWR_PXP = 1

use_gfactors = False  #: Use gfactors instead of deviations.
kT = 0.5925  #: Value of kT at room temperature.

DISALLOWED = " disallowed"
DASH = "-"


[docs]class Report: """ A class to calculate properties of proteins. To use this class in a script to compute a set of data:: reporter = Report(struct, ['SIDECHAIN DIHEDRALS', 'GAMMA BFACTORS']) dihedrals = reporter.get_set('SIDECHAIN DIHEDRALS') for point in dihedrals.points: resnum = point.descriptor.split()[1] resid = point.descriptor.split(':')[0] + ':' + resnum for val in point.values: try: true_value = float(val) except ValueError: # Some angles will have '-' if they aren't defined for # this residue type continue do_something_with_chi_angle(true_value) """
[docs] def __init__(self, ct, sets_to_run=('ALL',)): """ Create a Report instance. :type ct: `Structure<schrodinger.structure.Structure>` :param ct: The structure this Report operates on :type sets_to_run: list :param sets_to_run: Either ['ALL'] (default) or one or more of valid sets. Valid sets to compute: - 'STERIC CLASHES' - 'PRIMEX STERIC CLASHES' - 'BOND LENGTHS' - 'BOND ANGLES' - 'BACKBONE DIHEDRALS' - 'SIDECHAIN DIHEDRALS' - 'GFACTOR SUMMARY' - 'BFACTORS' - 'GAMMA BFACTORS' - 'PEPTIDE PLANARITY' - 'SIDECHAIN PLANARITY' - 'IMPROPER TORSIONS' - 'CHIRALITY' - 'MISSING ATOMS' """ self.data = [] self.table_names = [] self.table_headers = {} self.data_sets_to_run = [] self.data_set_labels = { 'STERIC CLASHES': self.steric_clash_data_set, 'PRIMEX STERIC CLASHES': self.primex_steric_clash_data_set, 'BOND LENGTHS': self.bond_length_data_set, 'BOND ANGLES': self.bond_angle_data_set, 'BACKBONE DIHEDRALS': self.backbone_dihedral_data_set, 'SIDECHAIN DIHEDRALS': self.sidechain_dihedral_data_set, 'GFACTOR SUMMARY': self.Gfactor_summary_data_set, 'BFACTORS': self.Bfactor_data_set, 'GAMMA BFACTORS': self.gamma_Bfactor_data_set, 'PEPTIDE PLANARITY': self.peptide_planarity_data_set, 'SIDECHAIN PLANARITY': self.sidechain_planarity_data_set, 'IMPROPER TORSIONS': self.improper_torsion_data_set, 'CHIRALITY': self.chirality_data_set, 'MISSING ATOMS': self.missing_atoms_data_set } if 'ALL' in sets_to_run: sets_to_run = [ 'STERIC CLASHES', # 'PRIMEX STERIC CLASHES', Too slow 'BOND LENGTHS', 'BOND ANGLES', 'BACKBONE DIHEDRALS', 'SIDECHAIN DIHEDRALS', 'GFACTOR SUMMARY', 'BFACTORS', 'GAMMA BFACTORS', 'PEPTIDE PLANARITY', 'SIDECHAIN PLANARITY', 'IMPROPER TORSIONS', 'CHIRALITY', 'MISSING ATOMS' ] need_xtal_mates = set(['STERIC CLASHES', 'PRIMEX STERIC CLASHES']) for label in sets_to_run: if label in list(self.data_set_labels): self.data_sets_to_run.append( (label, self.data_set_labels[label])) try: if need_xtal_mates.intersection(sets_to_run): mates = analyze.generate_crystal_mates(ct, radius=4.0) else: mates = [] except: self.setup_protein(ct) else: # generate_crystal_mates can return 0 mates if something goes wrong if len(mates) > 0: for atom in mates[0].atom: atom.property['i_pa_xtalatom'] = 0 for imate in range(1, len(mates)): for atom in mates[imate].atom: atom.property['i_pa_xtalatom'] = 1 mates[0].extend(mates[imate]) atoms_to_delete = analyze.evaluate_asl( mates[0], "not (fillres within 4 (atom.i_pa_xtalatom 0 ) )") mates[0].deleteAtoms(atoms_to_delete) self.setup_protein(mates[0]) else: self.setup_protein(ct) self.working_protein.noxtal_ct = ct for set_to_run in self.data_sets_to_run: set_label = set_to_run[0] set_type = set_to_run[1] curr_set = set_type(set_label) curr_set.analyze(self) self.data.append(curr_set) self.table_names.append(curr_set.title) self.table_headers[curr_set.title] = [ field for field in curr_set.fields ]
#################################################################################### ### Basic Data Structure ####################################################################################
[docs] class data_set: """ Base class for all data sets. :ivar label: the name of this data set :vartype label: str :ivar title: the name of this data set when printing out the data :vartype title: str :ivar fields: first item is the name of the `data_point` descriptor property, remaining items are the names of the items in the `data_point` values property :vartype fields: list[str] :ivar points: list of `data_point` objects :vartype points: list(data_point) :ivar summary: overall summary of the data set for the entire protein :vartype summary: str :ivar bad_points: a subset of problematic points for filtering in table and bubble plot :vartype bad_points: list(data_point) :ivar count: the number of violations, in most cases the length of `data_points`, but for 'global' or non-residue specific properties it may be just 0 (for no issues) or 1 (e.g., X-Ray check) :vartype count: int :ivar score: raw quality score, which has higher priority :vartype score: float :ivar bubble_scale: normalized scale :vartype bubble_scale: float :ivar color: bubble color used as a pylab color argument :vartype color: str :ivar area: bubble area :vartype area: float """
[docs] def __init__(self, label): self.label = label self.title = "" self.fields = [] self.points = [] self.summary = 'N/A' # the following is for bubble plot self.bad_points = [] self.count = 0 self.score = 1.234 self.bubble_scale = 0.0 self.color = '' self.area = 0.0
[docs] class data_point: """ Class that holds the data for each point in a data set :ivar descriptor: label for this point - typically the user friendly names of the atom or residues involved :vartype descriptor: str :ivar values: the values at this point - varies by subclass :vartype values: list(float or str) :ivar atoms: the atoms involved in this point :vartype atoms: list(int) """
[docs] def __init__( self, descriptor="", values=[], # noqa: M511 atoms=[]): # noqa: M511 self.descriptor = descriptor self.values = values self.atoms = atoms
[docs] def add_point(self, descriptor="", values=[], atoms=[]): # noqa: M511 """ Add a new point to the points property :type descriptor: str :param descriptor: Label for this point - typically the user-friendly names of the atom or residues involved :type values: list :param values: The values at this point - varies by subclass :type atoms: list :param atoms: The atoms involved in this point """ self.points.append(self.data_point(descriptor, values, atoms)) return
[docs] def report_data_points(self): """ Return all data points for this set in a list :rtype: list :return: list of data for each point in self.points, each item is a list whose first item is the point.descriptor and remaining items are the point.values items. """ info = [] for point in self.points: info.append([point.descriptor] + point.values) return info
[docs] def analyze(self, parent): """ Must be subclassed, this implementation does nothing :type parent: `Report` object :param parent: The Report object that this is for """ return
[docs] def report(self): """ Must be subclassed, this implementation does nothing """ return
#################################################################################################################### ### Look for steric clashes ####################################################################################################################
[docs] class steric_clash_data_set(data_set): """ Class to compute and hold data for Steric Clashes. Data point descriptor is atoms involved, values are "Distance", "Min Allowed", "Delta". Summary is N/A See parent class `data_set` for additional documenation """
[docs] def __init__(self, *args, **kwargs): Report.data_set.__init__(self, *args, **kwargs) self.parent = None self.min_hbond_ratio = 0.75 self.clash_threshold = 0.85
# Are two atoms within three bonds?
[docs] def within_three_bonds(self, protein, iatom, target_atom): """ Method to determine whether two atoms are within three bonds of each other. :type protein: Report.local_protein :param parent: The Report object that this is for :type iatom: int :param iatom: atom number of first atom :type target_atom: int :param target_atom: atom number of the second atom :rtype: bool :return: True if atoms are within 3 bonds of each other, False if not """ iatom_atoms = [] num_iatom_bonds = mm.mmct_atom_get_bond_total(protein.ct, iatom) for iatom_bond in range(1, num_iatom_bonds + 1): jatom = mm.mmct_atom_get_bond_atom(protein.ct, iatom, iatom_bond) if jatom == target_atom: return True else: num_jatom_bonds = mm.mmct_atom_get_bond_total( protein.ct, jatom) for jatom_bond in range(1, num_jatom_bonds + 1): katom = mm.mmct_atom_get_bond_atom( protein.ct, jatom, jatom_bond) if katom == target_atom: return True else: num_katom_bonds = mm.mmct_atom_get_bond_total( protein.ct, katom) for katom_bond in range(1, num_katom_bonds + 1): latom = mm.mmct_atom_get_bond_atom( protein.ct, katom, katom_bond) if latom == target_atom: return True return False
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: l{report} object :param parent: the report object that this is for """ self.title = "Steric Clashes" self.fields = [ " Atoms ", "Distance", "Min Allowed", "Delta" ] self.tooltip = ("Ratio of the interatomic distances to the sum of " "the atomic radii") self.run_analysis(parent.working_protein)
[docs] def run_analysis(self, protein): """ Iterate over the atom pairs and check record clashes :param protein: the protein :type protein: Report.local_protein """ # Get all interactions efficiently within 3.5 angstrom using a # distance cell. atoms = protein.xtal_atoms xyz = [[atom.x, atom.y, atom.z] for atom in protein.xtal_atoms] dcell = mm.mmct_create_distance_cell_xyz2(xyz, 3.5) for n, atom in enumerate(atoms): clash_list = mm.mmct_query_distance_cell( dcell, atom.x, atom.y, atom.z) size = mm.mmlist_get_size(clash_list) indices = [mm.mmlist_get(clash_list, i) for i in range(size)] indices.sort() for index in filter(lambda x: x > n, indices): self.analyze_pair(protein, atom, atoms[index]) mm.mmct_delete_distance_cell(dcell)
[docs] def analyze_pair(self, protein, atom1, atom2): """ Test a pair of atoms and record data if they clash. Order does not matter. :param protein: The protein :type protein: Report.local_protein :param atom1: One atom of the pair :type atom1: Report.local_atom :param atom2: The second atom of the pair :type atom2: Report.local_atom """ if atom1.pdbname[-1] == '*' and atom2.pdbname[-1] == '*': return distance = protein.ct.measure(atom1.index, atom2.index) reference_distance = atom1.radius + atom2.radius clash = old_div(distance, reference_distance) # Analyze potential clashes if clash > self.clash_threshold: return # Make sure it's not an H-bond. if self.check_hbond(protein, atom1, atom2, clash, distance): return # This is an expensive check, so only do it if everything else has passed. if not self.within_three_bonds(protein, atom1.index, atom2.index): clash_name = atom1.residue_name + ":" + atom1.pdbname + " - " \ + atom2.residue_name + ":" + atom2.pdbname # Assume atom indices in the normal CT are the same as in the CT with images (should be safe). view_atoms = [] for atom in [atom1, atom2]: if atom.pdbname[-1] != '*': view_atoms.append(atom.index) self.add_point(clash_name, [ distance, reference_distance, reference_distance - distance ], view_atoms)
[docs] def check_hbond(self, protein, atom1, atom2, clash=None, distance=None, require_hydrogen=True): """ Test and atom pair to see if they can be considered a hydrogen bond. The presence of a H-bond makes permissible atom proximity that would normally be considered a clash. Atom order does not matter. :param protein: The protein :type protein: Report.local_protein :param atom1: atom 1 :type atom1: Report.local_atom :param atom2: atom 2 :type atom2: Report.local_atom :param clash: Pre-computed clash ratio :type clash: float or None :param distance: Pre-computed distance :type distance: float or None :param require_hydrogen: Whether an intervening hydrogen must be found to qualify as an H-bond. :type require_hydrogen: bool """ # Only consider non-carbon heavy atoms if atom1.atomic_number in [1, 6] or atom2.atomic_number in [1, 6]: return False if distance is None: distance = protein.ct.measure(atom1.index, atom2.index) if clash is None: reference_distance = atom1.radius + atom2.radius clash = old_div(distance, reference_distance) if clash > 1: return False if clash < self.min_hbond_ratio: return False if not require_hydrogen: return True if self.find_hbond_hydrogen(protein, atom1, atom2, distance): return True if self.find_hbond_hydrogen(protein, atom2, atom1, distance): return True return False
[docs] def find_hbond_hydrogen(self, protein, donor, acceptor, distance=None): """ Locate a hydrogen that is bound to the donor that is closer to the acceptor than the donor is. :param protein: the protein :type protein: Report.local_protein :param donor: donor atom :type donor: Report.local_atom :param acceptor: acceptor atom :type acceptor: Report.local_atom :param distance: pre-computed distance between donor and acceptor :type distance: float or None """ if distance is None: distance = protein.ct.measure(donor.index, acceptor.index) for bonded_atom in protein.ct.atom[donor.index].bonded_atoms: if bonded_atom.atomic_number == 1: h_dist = protein.ct.measure(bonded_atom, acceptor.index) if h_dist < distance: return bonded_atom return None
#################################################################################################################### ### Look for steric clashes - PrimeX version ####################################################################################################################
[docs] class primex_steric_clash_data_set(steric_clash_data_set): """ A subclass of steric_clash_data_set that computes clashes using a different set of criteria used by PrimeX Polish. """
[docs] def __init__(self, *args, **kwargs): Report.steric_clash_data_set.__init__(self, *args, **kwargs) self.min_hbond_ratio = 0.1 self.clash_threshold = 0.8
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: l{report} object :param parent: the report object that this is for """ self.title = "PrimeX Steric Clashes" self.fields = [ " Atoms ", "Distance", "Min Allowed", "Delta", "Severe" ] self.tooltip = ("Ratio of the interatomic distances to the sum of " "the atomic radii - PrimeX version") ct = parent.working_protein.ct primex_working_protein = parent.make_local_protein( ct, True, VDWR_PXP) self.run_analysis(primex_working_protein)
[docs] def check_hbond(self, protein, atom1, atom2, clash=None, distance=None): """ Runs a number of quick checks first before calling the super class' method. See the super class for more information. """ # Check for H-bonding directly via H if (atom1.atomic_number == 1 and atom2.atomic_number not in [1, 6] or atom1.atomic_number not in [1, 6] and atom2.atomic_number == 1): if clash > self.min_hbond_ratio: return True # N and O never clash if (atom1.atomic_number == 7 and atom2.atomic_number == 8 or atom1.atomic_number == 8 and atom2.atomic_number == 7): return True # Non-backbone N never clashes with N # Non-backbone O never clashes with O a1backbone = (len(atom1.pdbname.split()[0]) == 1) a2backbone = (len(atom2.pdbname.split()[0]) == 1) if (atom1.atomic_number == atom2.atomic_number == 7 or atom1.atomic_number == atom2.atomic_number == 8): if not a1backbone or not a2backbone: return True # Run the regular h-bond test super = Report.steric_clash_data_set return super.check_hbond(self, protein, atom1, atom2, clash, distance, False)
[docs] def add_point(self, descriptor, values, atoms): """ Makes changes to how the output data is recorded relative to the super class. """ distance = values[0] reference_distance = values[1] clash = old_div(distance, reference_distance) if clash <= 0.7: severe = 'Severe' else: severe = '' values.append(severe) super = Report.steric_clash_data_set super.add_point(self, descriptor, values, atoms)
#################################################################################################################### ### Measure bond lengths ####################################################################################################################
[docs] class bond_length_data_set(data_set): """ Class to compute and hold data for Bond Length Deviations Data point descriptor is atoms involved, values are "Deviation" Summary is the RMS the deviations. See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Bond Length Deviations" self.tooltip = ("Deviation from the ideal value derived from Engh " "and Huber") if use_gfactors: self.fields = ["Bond", "G-Factor"] else: self.fields = ["Bond", "Deviation"] self.summary = 0.0 num_total = 0 for atom in parent.working_protein.atoms: pdbres = mm.mmct_atom_get_pdbres(parent.working_protein.ct, atom.index) if pdbres not in list(ideal_bond_lengths): continue if atom.pdbname in list(ideal_bond_lengths[pdbres]): num_bond = mm.mmct_atom_get_bond_total( parent.working_protein.ct, atom.index) for ibond in range(1, num_bond + 1): bound_atom = mm.mmct_atom_get_bond_atom( parent.working_protein.ct, atom.index, ibond) bound_atom_pdbname = mm.mmct_atom_get_pdbname( parent.working_protein.ct, bound_atom) if bound_atom_pdbname in list( ideal_bond_lengths[pdbres][atom.pdbname]): bond_name = atom.residue_name + ":" + atom.pdbname + " - " + bound_atom_pdbname bond_length = parent.working_protein.ct.measure( atom.index, bound_atom) deviation = abs( bond_length - ideal_bond_lengths[pdbres][ atom.pdbname][bound_atom_pdbname][1]) if use_gfactors: coefficient = ideal_bond_lengths[pdbres][ atom.pdbname][bound_atom_pdbname][0] gfactor = -coefficient * deviation**2 / kT self.add_point(bond_name, [gfactor], [atom.index, bound_atom]) else: self.add_point(bond_name, [deviation], [atom.index, bound_atom]) self.summary += deviation**2 num_total += 1 if num_total > 0: self.summary = math.sqrt(old_div(self.summary, float(num_total))) return
#################################################################################################################### ### Measure bond angles ####################################################################################################################
[docs] class bond_angle_data_set(data_set): """ Class to compute and hold data for Bond Angle Deviations Data point descriptor is atoms involved, values are "Deviation" Summary is the RMS of the deviations. See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Bond Angle Deviations" self.tooltip = ("Deviation from the ideal value derived from Engh " "and Huber") if use_gfactors: self.fields = ["Angle", "G-Factor"] else: self.fields = ["Angle", "Deviation"] self.summary = 0.0 num_total = 0 for atom in parent.working_protein.atoms: pdbres = mm.mmct_atom_get_pdbres(parent.working_protein.ct, atom.index) if pdbres not in list(ideal_bond_lengths): continue if atom.pdbname in list(ideal_bond_angles[pdbres]): num_ibond = mm.mmct_atom_get_bond_total( parent.working_protein.ct, atom.index) for ibond in range(1, num_ibond + 1): bound_atom_1 = mm.mmct_atom_get_bond_atom( parent.working_protein.ct, atom.index, ibond) bound_atom_1_pdbname = mm.mmct_atom_get_pdbname( parent.working_protein.ct, bound_atom_1) if bound_atom_1_pdbname in list( ideal_bond_angles[pdbres][atom.pdbname]): num_jbond = mm.mmct_atom_get_bond_total( parent.working_protein.ct, bound_atom_1) for jbond in range(1, num_jbond + 1): bound_atom_2 = mm.mmct_atom_get_bond_atom( parent.working_protein.ct, bound_atom_1, jbond) bound_atom_2_pdbname = mm.mmct_atom_get_pdbname( parent.working_protein.ct, bound_atom_2) if bound_atom_2_pdbname in list( ideal_bond_angles[pdbres][atom.pdbname] [bound_atom_1_pdbname]): angle_name = atom.residue_name + ":" + atom.pdbname + " - " + bound_atom_1_pdbname + " - " + bound_atom_2_pdbname bond_angle = parent.working_protein.ct.measure( atom.index, bound_atom_1, bound_atom_2) deviation = abs( bond_angle - ideal_bond_angles[pdbres][ atom.pdbname][bound_atom_1_pdbname] [bound_atom_2_pdbname][1]) if use_gfactors: coefficient = ideal_bond_angles[pdbres][ atom.pdbname][bound_atom_1_pdbname][ bound_atom_2_pdbname][0] deviation = numpy.pi * deviation / 180.0 gfactor = -coefficient * deviation**2 / kT self.add_point(angle_name, [gfactor], [ bound_atom_1, atom.index, bound_atom_2 ]) else: self.add_point( angle_name, [deviation], [ bound_atom_1, atom.index, bound_atom_2 ]) self.summary += deviation**2 num_total += 1 if num_total > 0: self.summary = math.sqrt(old_div(self.summary, float(num_total))) return
#################################################################################################################### ### Backbone dihedrals ####################################################################################################################
[docs] class backbone_dihedral_data_set(data_set): """ Class to compute and hold data for Backbone Dihedrals Data point descriptor is the residue involved, values are "Phi", "Psi", "G-Factor" Summary is N/A See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Backbone Dihedrals" self.tooltip = ( "Log-probability (G-Factor) for backbone dihedrals, " "along with dihedral angles") self.fields = ["Residue", "Phi", "Psi", "G-Factor"] residues = parent.working_protein.residues if not residues: return ct = parent.working_protein.ct residue = residues[0] default_values = ['-', '-', '-'] indices = residue.get_atom_indices() self.add_point(residue.name, default_values, indices) pattern = re.compile("-180.0") for iresidue in range(1, len(residues) - 1): # Get previous, current, and next residue and their backbone # atoms presidue = residues[iresidue - 1] residue = residues[iresidue] nresidue = residues[iresidue + 1] pC = presidue.backbone[" C "] N = residue.backbone[" N "] CA = residue.backbone[" CA "] C = residue.backbone[" C "] nN = nresidue.backbone[" N "] indices = residue.get_backbone_indices() # Check if atoms exist if not (pC and N and CA and C and nN): self.add_point(residue.name, default_values, indices) continue # Check if residues are connected connected = (ct.measure(pC.index, N.index) < 3.0 and ct.measure(C.index, nN.index) < 3.0) if not connected: self.add_point(residue.name, default_values, indices) continue phi = ct.measure(pC.index, N.index, CA.index, C.index) psi = ct.measure(N.index, CA.index, C.index, nN.index) dihedral_key = str(round(phi, -1)) + ',' + str(round(psi, -1)) dihedral_key = re.sub(pattern, "180.0", dihedral_key) pdbres = mm.mmct_atom_get_pdbres(ct, CA.index) if pdbres not in list(backbone_dihedrals): pdbres = "GEN " phi = round(phi, 2) psi = round(psi, 2) dihedrals = backbone_dihedrals[pdbres] if dihedral_key in dihedrals: gfactor = math.log(dihedrals[dihedral_key]) values = [phi, psi, gfactor] else: values = [phi, psi, DISALLOWED] self.add_point(residue.name, values, indices) # Add last residue point if len(residues) > 2: residue = residues[-1] indices = residue.get_backbone_indices() self.add_point(residue.name, default_values, indices)
#################################################################################################################### ### Sidechain dihedrals ####################################################################################################################
[docs] class sidechain_dihedral_data_set(data_set): """ Class to compute and hold data for Sidechain Dihedrals Data point descriptor is the residue involved, values are "Chi1", "Chi2", "G-Factor" Summary is N/A See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Sidechain Dihedrals" self.tooltip = ( "Log-probability (G-Factor) for sidechain dihedrals, " "along with dihedral angles") self.fields = ["Residue", "Chi1", "Chi2", "G-Factor"] for residue in parent.working_protein.residues: residue_dihedrals = [] pdbres = mm.mmct_atom_get_pdbres(parent.working_protein.ct, residue.atoms[0].index) # If no sidechain atoms, pick an appropriate backbone one. target_atoms = [] if len(residue.sidechain) > 0: target_atoms = residue.get_sidechain_indices() else: target_atoms = residue.get_backbone_indices() if pdbres in sidechain_dihedrals: for dihedral in sidechain_dihedral_atomnames[pdbres]: dihedral_atoms = [] for atomname in dihedral: for atom in residue.atoms: if atom.pdbname == atomname: dihedral_atoms.append(atom.index) if len(dihedral_atoms) == 4: dihedral_value = parent.working_protein.ct.measure( dihedral_atoms[0], dihedral_atoms[1], dihedral_atoms[2], dihedral_atoms[3]) residue_dihedrals.append(dihedral_value) if len(residue_dihedrals) == len( sidechain_dihedral_atomnames[pdbres]): # Round to the nearest 10 degrees #dihedral_list = ",".join([ str(round(dihedral,1)) for dihedral in residue_dihedrals ]) residue_dihedrals = [ round(dihedral, 2) for dihedral in residue_dihedrals ] dihedral_key = ",".join([ str(round(dihedral, -1)) for dihedral in residue_dihedrals ]) pattern = re.compile("-180.0") dihedral_key = re.sub(pattern, "180.0", dihedral_key) if len(residue_dihedrals) == 1: residue_dihedrals.append(DASH) if dihedral_key in sidechain_dihedrals[pdbres]: self.add_point( residue.name, residue_dihedrals + [ math.log(sidechain_dihedrals[pdbres] [dihedral_key]) ], target_atoms) else: self.add_point(residue.name, residue_dihedrals + [DISALLOWED], target_atoms) else: self.add_point(residue.name, [DASH, DASH, DASH], target_atoms) else: self.add_point(residue.name, [DASH, DASH, DASH], target_atoms) return
#################################################################################################################### ### Summary of G-factors ####################################################################################################################
[docs] class Gfactor_summary_data_set(data_set): """ Class to compute and hold data for G-factor summaries of the Backbone and Sidechain dihedrals. Data point descriptor is the residue involved, values are "Backbone", "Sidechain", "Total" Summary is N/A See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "G-factor Summary" self.tooltip = ( "Log-probability (G-Factor) for backbone dihedrals, " "sidechain dihedrals, and the sum of the two") self.fields = ["Residue", "Backbone", "Sidechain", "Total"] atoms = {} for set in parent.data: if set.title == "Backbone Dihedrals": backbone_data = set.report_data_points() for point in set.points: atoms[point.descriptor] = point.atoms elif set.title == "Sidechain Dihedrals": sidechain_data = set.report_data_points() for iline in range(len(backbone_data)): if backbone_data[iline][3] == DISALLOWED or sidechain_data[ iline][3] == DISALLOWED: total = DISALLOWED elif sidechain_data[iline][3] == DASH: total = backbone_data[iline][3] elif backbone_data[iline][3] == DASH: total = sidechain_data[iline][3] else: total = backbone_data[iline][3] + sidechain_data[iline][3] self.add_point( backbone_data[iline][0], [backbone_data[iline][3], sidechain_data[iline][3], total], atoms[backbone_data[iline][0]]) return
#################################################################################################################### ### Calculate B-factor information ####################################################################################################################
[docs] class Bfactor_data_set(data_set): """ Class to compute and hold data for B-Factors Data point descriptor is the residue involved, values are "Backbone", "BBStdDev", "Sidechain", "SCStdDev" Summary is the average B-Factor for backbone and sidechain atoms See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Average B-factors" self.tooltip = ("Average temperature factors for the backbone " "and sidechain of each residue") self.fields = [ "Residue", "Backbone", "BBStdDev", "Sidechain", "SCStdDev" ] self.summary = 0.0 num_total = 0 for residue in parent.working_protein.residues: back_n_atoms = 0 side_n_atoms = 0 avg_back_Bfactor = 0.0 avg_side_Bfactor = 0.0 for atom in future.utils.listvalues(residue.backbone): if atom is None: continue if atom.atomic_number == 1: continue avg_back_Bfactor += atom.temperature_factor self.summary += atom.temperature_factor back_n_atoms += 1 num_total += 1 for atom in future.utils.listvalues(residue.sidechain): if atom.atomic_number == 1: continue avg_side_Bfactor += atom.temperature_factor self.summary += atom.temperature_factor side_n_atoms += 1 num_total += 1 if back_n_atoms > 0: avg_back_Bfactor /= back_n_atoms if side_n_atoms > 0: avg_side_Bfactor /= side_n_atoms stddev_back_Bfactor = 0.0 stddev_side_Bfactor = 0.0 for atom in future.utils.listvalues(residue.backbone): if atom is None: continue if atom.atomic_number == 1: continue stddev_back_Bfactor += (atom.temperature_factor - avg_back_Bfactor)**2 for atom in future.utils.listvalues(residue.sidechain): if atom.atomic_number == 1: continue stddev_side_Bfactor += (atom.temperature_factor - avg_side_Bfactor)**2 if back_n_atoms > 0: stddev_back_Bfactor /= back_n_atoms if side_n_atoms > 0: stddev_side_Bfactor /= side_n_atoms self.add_point(residue.name, [ avg_back_Bfactor, stddev_back_Bfactor, avg_side_Bfactor, stddev_side_Bfactor ], residue.get_atom_indices()) if num_total > 0: self.summary = old_div(self.summary, float(num_total)) return
#################################################################################################################### ### Gamma atom B-factor ####################################################################################################################
[docs] class gamma_Bfactor_data_set(data_set): """ Class to compute and hold data for B-Factors of sidechain gamma atoms Data point descriptor is the residue involved, value is "B-Factor" Summary is the average B-Factor for gamma atoms See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Gamma Atom B-Factor" self.tooltip = ("Temperature-factor for the gamma atom in each " "residue") self.fields = ["Residue", "B-Factor"] self.summary = 0.0 num_total = 0 for residue in parent.working_protein.residues: for atom in future.utils.listvalues(residue.sidechain): if atom.pdbname[2] == "G": self.add_point(residue.name + ":" + atom.pdbname, [atom.temperature_factor], [atom.index]) self.summary += atom.temperature_factor num_total += 1 if num_total > 0: self.summary = old_div(self.summary, float(num_total)) return
#################################################################################################################### ### Calculate peptide bond planarity ####################################################################################################################
[docs] class peptide_planarity_data_set(data_set): """ Class to compute and hold data for Peptide Planarity Data point descriptor is the atoms involved, value is "Dihedral Angle" Summary is the average absolute planarity See parent class `data_set` for additional documenation """ MAX_PEPTIDE_BOND_LENGTH = 3.0
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Peptide Planarity" self.tooltip = ("RMSD of the atoms in the peptide linkage from " "the plane that minimizes the RMSD") self.fields = ["Peptide Bond", "Dihedral Angle"] self.summary = 0.0 num_total = 0 ct = parent.working_protein.ct residues = parent.working_protein.residues residue_total = len(residues) for iresidue in range(residue_total - 1): # Get backbone atoms of current and neighboring residues residue = residues[iresidue] CA_atom = residue.backbone[" CA "] C_atom = residue.backbone[" C "] nresidue = residues[iresidue + 1] nN_atom = nresidue.backbone[" N "] nCA_atom = nresidue.backbone[" CA "] atoms = [CA_atom, C_atom, nN_atom, nCA_atom] if any(atom is None for atom in atoms): continue distance = ct.measure(C_atom.index, nN_atom.index) if distance >= self.MAX_PEPTIDE_BOND_LENGTH: continue dihedral = ct.measure(CA_atom.index, C_atom.index, nN_atom.index, nCA_atom.index) descriptor = f"{residue.name:>10s} - {nresidue.name:<10s}" values = [abs(dihedral)] atoms = [C_atom.index, nN_atom.index] self.add_point(descriptor, values, atoms) self.summary += abs(dihedral) num_total += 1 if num_total > 0: self.summary = old_div(self.summary, float(num_total)) return
#################################################################################################################### ### Calculate the planarity of appropriate side chain groups ####################################################################################################################
[docs] class sidechain_planarity_data_set(data_set): """ Class to compute and hold RMS data for planarity of sidechains Data point descriptor is the residue involved, value is "RMSD From Planarity" Summary is the average RMSD deviation from planarity of sidechains See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Sidechain Planarity" self.tooltip = ("RMSD of nominally planar sidechain groups from " "the plane that minimizes the RMSD") self.fields = ["Residue", "RMSD From Planarity"] self.summary = 0.0 # Conventions: # ax + by + cz + d = 0 # OR z = mx + ny + b # # a = [ x1 y1 1 ] # [ x2 y2 1 ] # [ x3 y3 1 ] # b = [ z1 ] # [ z2 ] # [ z3 ] # # a * solution = b # # solution = [ m ] # [ n ] # [ b ] # # distance = |ax0 + by0 + cz0 + d| / sqrt( a^2 + b^2 + c^2 ) # = |mx0 + ny0 - z0 + b| / sqrt( m^2 + n^2 + 1 ) num_total = 0 for residue in parent.working_protein.residues: pdbres = mm.mmct_atom_get_pdbres(parent.working_protein.ct, residue.atoms[0].index) if pdbres in planar_groups: atoms = [] atom_indices = [] for atom in future.utils.listvalues(residue.sidechain): if atom.pdbname in planar_groups[pdbres]: atoms.append([atom.x, atom.y, atom.z]) atom_indices.append(atom.index) if len(atoms) > 3: # Try this with three different choices, in case the ring is parallel to an axis. best_msd = 1000.0 # z = ax + by + c A = numpy.array([[atom[0], atom[1], 1] for atom in atoms ]) b = numpy.array([atom[2] for atom in atoms]) solution, _, _, _ = numpy.linalg.lstsq(A, b, rcond=None) msd = 0.0 for atom in atoms: msd += old_div( (solution[0] * atom[0] + solution[1] * atom[1] - atom[2] + solution[2])**2, (solution[0]**2 + solution[1]**2 + 1)) if msd < best_msd: best_msd = msd # y = ax + bz + c A = numpy.array([[atom[0], atom[2], 1] for atom in atoms ]) b = numpy.array([atom[1] for atom in atoms]) solution, _, _, _ = numpy.linalg.lstsq(A, b, rcond=None) msd = 0.0 for atom in atoms: msd += old_div( (solution[0] * atom[0] + solution[1] * atom[2] - atom[1] + solution[2])**2, (solution[0]**2 + solution[1]**2 + 1)) if msd < best_msd: best_msd = msd # x = az + by + c A = numpy.array([[atom[2], atom[1], 1] for atom in atoms ]) b = numpy.array([atom[0] for atom in atoms]) solution, _, _, _ = numpy.linalg.lstsq(A, b, rcond=None) msd = 0.0 for atom in atoms: msd += old_div( (solution[0] * atom[2] + solution[1] * atom[1] - atom[0] + solution[2])**2, (solution[0]**2 + solution[1]**2 + 1)) if msd < best_msd: best_msd = msd # Now wrap it up. best_msd /= float(len(atoms)) rmsd = math.sqrt(best_msd) self.summary += rmsd num_total += 1 self.add_point(residue.name, [rmsd], atom_indices) if num_total > 0: self.summary = old_div(self.summary, float(num_total)) return
#################################################################################################################### ### Improper Torsions ####################################################################################################################
[docs] class improper_torsion_data_set(data_set): """ Class to compute and hold RMS data for improper torsions Data point descriptor is the residue involved, value is "RMS Deviation" Summary is the average RMSD deviation for improper torsions See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Improper Torsions" self.tooltip = ("RMSD of the improper torsion for atoms in " "nominally planar sidechains") self.fields = ["Residue", "RMS Deviation"] self.summary = 0.0 num_total = 0 for residue in parent.working_protein.residues: residue_dihedrals = [] pdbres = mm.mmct_atom_get_pdbres(parent.working_protein.ct, residue.atoms[0].index) if pdbres in improper_torsions: rms_deviation = 0.0 num_found = 0 for dihedral in improper_torsions[pdbres]: dihedral_atoms = [] for atomname in dihedral: for atom in residue.atoms: if atom.pdbname == atomname: dihedral_atoms.append(atom.index) if len(dihedral_atoms) == 4: dihedral_value = parent.working_protein.ct.measure( dihedral_atoms[0], dihedral_atoms[1], dihedral_atoms[2], dihedral_atoms[3]) rms_deviation += (180.0 - abs(dihedral_value))**2 self.summary += (180.0 - abs(dihedral_value)) num_found += 1 num_total += 1 if num_found > 0: rms_deviation = math.sqrt( old_div(rms_deviation, float(num_found))) self.add_point(residue.name, [rms_deviation], residue.get_sidechain_indices()) else: self.add_point(residue.name, [DASH], residue.get_sidechain_indices()) else: self.add_point(residue.name, [DASH], residue.get_sidechain_indices()) if num_total > 0: self.summary = old_div(self.summary, float(num_total)) return
#################################################################################################################### ### Calculate the CA stereochemistry ####################################################################################################################
[docs] class chirality_data_set(data_set): """ Class to compute and hold C-alpha chirality data Data point descriptor is the residue involved, value is C-alpha "Chirality" Summary is N/A See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "C-alpha Stereochemistry" self.tooltip = ("L/D chirality of the C-alpha atoms") self.fields = ["Residue", "Chirality"] for residue in parent.working_protein.residues: if " CB " in list(residue.sidechain): try: CA_N = [ residue.backbone[" N "].x - residue.backbone[" CA "].x, residue.backbone[" N "].y - residue.backbone[" CA "].y, residue.backbone[" N "].z - residue.backbone[" CA "].z ] CA_C = [ residue.backbone[" C "].x - residue.backbone[" CA "].x, residue.backbone[" C "].y - residue.backbone[" CA "].y, residue.backbone[" C "].z - residue.backbone[" CA "].z ] CA_CB = [ residue.sidechain[" CB "].x - residue.backbone[" CA "].x, residue.sidechain[" CB "].y - residue.backbone[" CA "].y, residue.sidechain[" CB "].z - residue.backbone[" CA "].z ] backbone_cross = [ CA_N[1] * CA_C[2] - CA_N[2] * CA_C[1], CA_N[2] * CA_C[0] - CA_N[0] * CA_C[2], CA_N[0] * CA_C[1] - CA_N[1] * CA_C[0] ] dot_product = backbone_cross[0] * CA_CB[ 0] + backbone_cross[1] * CA_CB[1] + backbone_cross[ 2] * CA_CB[2] if dot_product > 0.0: self.add_point(residue.name, ["L"], [residue.backbone[" CA "].index]) else: self.add_point(residue.name, ["D"], [residue.backbone[" CA "].index]) except: pass return
#################################################################################################################### ### Find residues with missing atoms ####################################################################################################################
[docs] class missing_atoms_data_set(data_set): """ Class to compute and hold information on missing atoms Data point descriptor is the residue involved, value is nothing Summary is N/A See parent class `data_set` for additional documenation """
[docs] def analyze(self, parent): """ compute and store the data for this class :type parent: `report` object :param parent: the report object that this is for """ self.title = "Missing Atoms" self.tooltip = ("Residues for which atoms are missing from the " "structure") self.fields = ["Residue"] resnames = [] resatoms = [] iter_res = structure.get_residues_unsorted( parent.working_protein.noxtal_ct) for residue in iter_res: resnames.append(parent.get_residue_name(residue.atom[1])) resatoms.append([int(atom) for atom in residue.atom]) mm.mmpdb_initialize(mm.error_handler) bs = Bitset( mm.mmpdb_set_incomplete_res_bs( parent.working_protein.noxtal_ct)) mm.mmpdb_terminate() for i in bs: self.add_point(resnames[i - 1], values=[], atoms=resatoms[i - 1]) return
#################################################################################### ### For setting up the protein #################################################################################### # Store atom information locally, for speed.
[docs] class local_atom: """ Private class used to store atom information locally for speed """
[docs] def __init__(self): self.index = 0 self.atomic_number = 0 self.atom_type = 0 self.radius = 0.0 self.pdbname = "" self.x = 0.0 self.y = 0.0 self.z = 0.0 self.temperature_factor = 0.0 return
# Group atoms into residues, for ease in cycling over them.
[docs] class local_residue: """ Private class used to store residue information locally for convenience """
[docs] def __init__(self): self.name = "" self.backbone = {" N ": None, \ " CA ": None, \ " C ": None, \ " O ": None, \ " CH3": None} self.sidechain = {} self.atoms = [] return
[docs] def get_backbone_indices(self): backbone_atoms = [] for atom in future.utils.listvalues(self.backbone): if atom is not None: backbone_atoms.append(atom.index) return backbone_atoms
[docs] def get_sidechain_indices(self): return [ atom.index for atom in future.utils.listvalues(self.sidechain) ]
[docs] def get_atom_indices(self): return [atom.index for atom in self.atoms]
# The local copy of the entire structure.
[docs] class local_protein: """ Private class used to store protein information locally for convenience """
[docs] def __init__(self): self.atoms = [] self.xtal_atoms = [] self.residues = [] self.ct = None self.noxtal_ct = None return
[docs] def get_vdw_radius(self, atom, vdwr_mode=VDWR_MMAT_H): if vdwr_mode == VDWR_MMAT: return mm.mmat_get_vdw_radius(atom.atom_type) if vdwr_mode == VDWR_MMAT_H: if atom.atomic_number == 1: return 1.1 else: return mm.mmat_get_vdw_radius(atom.atom_type) if vdwr_mode == VDWR_PXP: return pxp_vdwr[atom.atomic_number] raise ValueError('Unknown value for Van der Waals radius mode: %d' % vdwr_mode)
[docs] def setup_protein(self, ct): """ An internal method used to set up the protein for Report calculations. This method should be considered a private method of the Report class and need not be explicitly called by the user/calling script. :type ct: `Structure<schrodinger.structure.Structure>` :param ct: The structure this method operates on """ self.working_protein = self.make_local_protein(ct)
[docs] def make_local_protein(self, ct, keep_hydrogens=False, vdwr_mode=VDWR_MMAT_H): """ :param ct: The structure this method operates on :type ct: `Structure<schrodinger.structure.Structure>` :param keep_hydrogens: Whether to include hydrogens in the local protein :type keep_hydrogens: bool :param vdwr_mode: which VDWR source should be used :type vdwr_mode: int """ working_protein = self.local_protein() working_protein.ct = ct for iatom in range(1, mm.mmct_ct_get_atom_total(ct) + 1): if 'i_pa_xtalatom' in ct.atom[iatom].property and ct.atom[ iatom].property['i_pa_xtalatom'] == 1: xtal = True else: xtal = False iatom_name = mm.mmct_atom_get_pdbname(ct, iatom) iatom_residue = mm.mmct_atom_get_pdbres(ct, iatom) if iatom_name == " CA " or (iatom_name == " CH3" and iatom_residue == "ACE "): residue_name = self.get_residue_name(iatom, working_protein) residue_atoms = ct.getResidueAtoms(iatom) new_residue = self.local_residue() new_residue.name = residue_name for residue_atom in residue_atoms: # Not interested in hydrogens. if not keep_hydrogens and residue_atom.atomic_number == 1: continue new_atom = self.local_atom() new_atom.index = int(residue_atom) new_atom.atomic_number = mm.mmct_atom_get_atomic_number( ct, residue_atom) new_atom.atom_type = mm.mmct_atom_get_type(ct, residue_atom) new_atom.radius = self.get_vdw_radius(new_atom, vdwr_mode) if xtal: new_atom.pdbname = mm.mmct_atom_get_pdbname( ct, residue_atom) + ' *' else: new_atom.pdbname = mm.mmct_atom_get_pdbname( ct, residue_atom) new_atom.residue_name = residue_name new_atom.x = mm.mmct_atom_get_x(ct, residue_atom) new_atom.y = mm.mmct_atom_get_y(ct, residue_atom) new_atom.z = mm.mmct_atom_get_z(ct, residue_atom) try: new_atom.temperature_factor = mm.mmct_atom_property_get_real( ct, 'r_m_pdb_tfactor', int(residue_atom)) except: new_atom.temperature_factor = 0.0 if new_atom.pdbname in list(new_residue.backbone): new_residue.backbone[new_atom.pdbname] = new_atom else: new_residue.sidechain[new_atom.pdbname] = new_atom new_residue.atoms.append(new_atom) if not xtal: # This list should not contain symmetry atoms. working_protein.atoms.append(new_atom) # This list should contain all atoms, normal and symmetry. working_protein.xtal_atoms.append(new_atom) if not xtal: # Only keep the residue if it's a non-xtal one. working_protein.residues.append(new_residue) return working_protein
# Utility to piece together a comprehensive residue name.
[docs] def get_residue_name(self, iatom, protein=None): """ Create a residue name for the atom iatom. :type iatom: int :param iatom: the atom index to build a residue name for :rtype: str :return: A residue name of the form: Chain:PDB_residue_codeResidue_numberInsertion_code where PDB_residue_code is a 4 character field and Residue_number is a 3 character field """ if protein is None: protein = self.working_protein (resnum, inscode) = mm.mmct_atom_get_resnum(protein.ct, iatom) if inscode == " ": inscode = "" chain = mm.mmct_atom_get_chain(protein.ct, iatom) if chain == " ": chain = "_" pdbres = mm.mmct_atom_get_pdbres(protein.ct, iatom) residue = "%s:%4s%3d%s" % (chain, pdbres, resnum, inscode) return residue
[docs] def get_set(self, set_label): """ Return a single set of data that was specified using the sets_to_run parameter when the class object was created. :type set_label: str :param set_label: One of the sets specified at Report object creation time using the sets_to_run parameter. Valid sets are: - 'STERIC CLASHES' - 'BOND LENGTHS' - 'BOND ANGLES' - 'BACKBONE DIHEDRALS' - 'SIDECHAIN DIHEDRALS' - 'GFACTOR SUMMARY' - 'BFACTORS' - 'GAMMA BFACTORS' - 'PEPTIDE PLANARITY' - 'SIDECHAIN PLANARITY' - 'IMPROPER TORSIONS' - 'CHIRALITY' - 'MISSING ATOMS' If the requested set was not specified at the Report object creation time, None will be returned. """ for set in self.data: if set.label == set_label: return set return None
[docs] def write_to_stdout(self): """ Write each of the computed sets of data to the terminal """ for set in self.data: print("%-40s" % set.title) print(("%-30s" % set.fields[0]), end=' ') # CHANGED to align columns for header in set.fields[1:]: print(("%-10s" % header), end=' ') print() for point in set.points: print(("%-30s" % point.descriptor), end=' ') for value in point.values: try: float(value) except: print(("%10s" % str(value)), end=' ') else: print(("%8.2f" % value), end=' ') print() print() return
if __name__ == '__main__': inputfile_name = sys.argv[1] ct = structure.StructureReader.read(inputfile_name) prosane = Report(ct, sets_to_run=['ALL']) prosane.write_to_stdout() print() print('Summaries:') print() for set in prosane.data: print(' ' + set.title + ' : ' + str(set.summary)) print() print('Getting by label: ') print(' ' + str(prosane.get_set('IMPROPER TORSIONS').summary)) print()