Source code for schrodinger.test.stu.outcomes.prime_workups

import operator
import re
from past.utils import old_div

import numpy as np

from schrodinger import structure
from schrodinger.protein.analysis import Report as ProteinReport
from schrodinger.structutils import analyze
from schrodinger.structutils import measure
from schrodinger.structutils.interactions import hbond
from schrodinger.test import structurecheck


[docs]def partial_structure_reader(filename, frames_to_check=None): """ Iterator that returns the specified frames of the structures in filename {structure} By default return all frames, but if frames_to_check { list, set or anything with an "__in__" method" of integers} is set then the only frames for which their index (starting with 0) are in frames to check are returned. Note that even if a list is passed with integers out of order, the frames will always be yielded in the order that they are in the file. """ if isinstance(frames_to_check, str): frames_to_check = eval(frames_to_check) if (type(frames_to_check) == int): frames_to_check = [frames_to_check] for ict, ct in enumerate(structure.StructureReader(filename)): if (frames_to_check is not None and ict not in frames_to_check): continue yield ct
[docs]def convert_plop_name_to_asl(input_str): """ Convert a string specifying a residue or atom into an asl expresion Input should be in the form <chain>:<resnum><inscode> or <chain>:<resnum><inscore>:<pdb atom name> <chain> is the chain id or a _ or no character if space <resnum> is the residue number (possibly negative) <inscode> is the pdb insertion code <pdb atom name> The 4 letter pdb atom name. Either spaces or _ can be used for padding to get to 4 characters. Examples A:12 :23A B:-1 _:12 The output is a ASL expression stored as a string that corresponds to the same atom(s) in the input. """ atom_match = re.search(r"(.{0,1}):(-{0,1}\d+)([^:]{0,1}):(.+)", input_str) res_match = re.search(r"(.{0,1}):(-{0,1}\d+)([^:]{0,1})", input_str) if (atom_match): chain = atom_match.group(1) resnum = eval(atom_match.group(2)) inscode = atom_match.group(3) pdbatom = atom_match.group(4) pdbatom = pdbatom.replace("_", " ") elif (res_match): chain = res_match.group(1) resnum = eval(res_match.group(2)) inscode = res_match.group(3) pdbatom = None else: raise TypeError("Could not convert input: %s" % input_str) if (chain == "_" or chain == ""): chain = " " if (inscode == ""): inscode = " " if (pdbatom is None): return '(chain.name "%s" AND res.num %d AND res.inscode "%s")' % ( chain, resnum, inscode) else: return ('(chain.name "%s" AND res.num %d AND res.inscode "%s" AND ' + 'atom.ptype "%s")') % (chain, resnum, inscode, pdbatom)
[docs]def get_moved_residues(ct, ref_ct, tolerance): """ For a given ct {structure.Stucture} and ref_ct {structure.Structure} with the same number of atoms in the same order, return a sorted list with all the residue names (__str__() evaluation of their associated structure._Residue class) that contains atoms that moved more than a tolerance {float} in Angstroms. """ if (len(ct.atom) != len(ref_ct.atom)): raise RuntimeError('Input structures must have the same ' + 'number of atoms') residues_moved = set() for atom1, atom2 in zip(ct.atom, ref_ct.atom): if (measure.measure_distance(atom1, atom2) > tolerance): residues_moved.add(str(atom1.getResidue())) return sorted(list(residues_moved))
[docs]def get_phi_psi(ct): """ Return a dictionary with the key being the name of each residues and the value being a 2 member list containing the PHI and PSI angles of that residue """ output = {} rep = ProteinReport(ct, ['BACKBONE DIHEDRALS']).data[0] for point in rep.points: if (point.descriptor == '-'): continue vec = [i for i in point.values] resnum = point.descriptor.split()[1] resid = point.descriptor.split(':')[0] + ':' + resnum output[resid] = vec return output
[docs]def get_modified_backbone_angles(ct, ref_ct, tolerance): """ For a given ct {structure.Stucture} and ref_ct {structure.Structure} with the same number of atoms in the same order, return a sorted list with all the residue names (__str__() evaluation of their associated structure._Residue class) that contains atoms that moved either phi or psi angles more than tolerance {float} in degrees """ phi_psi = get_phi_psi(ct) ref_phi_psi = get_phi_psi(ref_ct) changed = set() for res in ref_phi_psi: try: delta_phi = min(abs(phi_psi[res][0] - ref_phi_psi[res][0]), abs(phi_psi[res][0] - ref_phi_psi[res][0] - 360), abs(phi_psi[res][0] - ref_phi_psi[res][0] + 360)) delta_psi = min(abs(phi_psi[res][1] - ref_phi_psi[res][1]), abs(phi_psi[res][1] - ref_phi_psi[res][1] + 360), abs(phi_psi[res][1] - ref_phi_psi[res][1] - 360)) except TypeError: continue if (delta_phi > tolerance or delta_psi > tolerance): changed.add(res) return list(changed)
[docs]def check_moved_residues(filename, reference_fn, moved_residues, tolerance=0.01, check_backbone_dihedrals=False, frames_to_check=None): """ STU workup that will check to see if a list of moved residues { string the residue names (__str__() evaluation of the structure._Residue class) separated by spaces} corresponds to the list of all residue that have an an atom that moved more than tolerance {float} between any of the frames_to_check in filename (see partial_structure_reader) and the first structure frame in reference_fn. Returns True if the lists match, False if they dont if check_bacbone_dihedrals{logical} is True then check use the residues that either their phi or psi angle moved by more than tolerance degrees are used instead of distance in cartesian space """ ref_ct = structure.Structure.read(reference_fn) moved_residues_list = moved_residues.split(" ") errors = [] for ct in partial_structure_reader(filename, frames_to_check): if (check_backbone_dihedrals): this_moved_residues_list = get_modified_backbone_angles( ct, ref_ct, tolerance) else: this_moved_residues_list = get_moved_residues(ct, ref_ct, tolerance) should_have_moved = set(moved_residues_list) - \ set(this_moved_residues_list) should_not_have_moved = set(this_moved_residues_list) - \ set(moved_residues_list) if (should_have_moved): errors.append("ERROR: %s Residues did not moved %s" % (ct.title, " ".join(list(should_have_moved)))) if (should_not_have_moved): errors.append("ERROR: %s Residues moved %s" % (ct.title, " ".join(list(should_not_have_moved)))) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def check_protein_health(filename, frames_to_check=None, steric_delta=0.3, bond_length_deviation=0.15, protein_health_kwargs=None): """ Use the featurs in structurecheck to identify potential problems in the structure. By default this will find any instances where there is a steric clash of more than 30% of the vdW radius (set by steric delta {float}) or any bond length deviations of more than 0.15 angstroms (set by bond_length_deviation {float}) The number of allowable clashes, bond length deviations and other structual properties can be set by passing a dictionary into protein_health_kwargs {dict} as decirbed in the help for structurecheck.ProteinReportCheck. frames_to_check is descirbed above Note that protein_health_kwargs needs to have a dictionary passed to it to be useful as a STU test. Currently STU does not allow this. """ if (protein_health_kwargs is None): protein_health_kwargs = {'STERIC CLASHES': 0, 'BOND LENGTHS': 0} for ct in partial_structure_reader(filename, frames_to_check): prc = structurecheck.ProteinReportCheck( ct, steric_delta=steric_delta, bond_length_deviation=bond_length_deviation) prc.assertProteinHealth(ct.title, protein_health_kwargs) return True
[docs]def measure_distance(ct1, name1, ct2, name2): """ For a given structure ct1 {structure.Structure} and the Plop residue or atom name name1 {string, described convert_plop_name_to_asl} and a second ct2 {structure.Structure}, name2 {string} pair following the same convention, return the minimum distance between the two. If either name matches no values in the structure then None is returned. """ # This could be either an atom name or a residue name list1 = analyze.evaluate_asl(ct1, convert_plop_name_to_asl(name1)) list2 = analyze.evaluate_asl(ct2, convert_plop_name_to_asl(name2)) min_dist = None for iatom1 in list1: for iatom2 in list2: one_dist = measure.measure_distance(ct1.atom[iatom1], ct2.atom[iatom2]) if (min_dist is None or one_dist < min_dist): min_dist = one_dist return min_dist
[docs]def check_distance(filename, name1, name2, distance, tolerance, frames_to_check=None): """ STU workup For the specified structure frames in a filename {string} return True if the minimum distance between the atom or residue name1 {string} is distance {float} +- tolerance{float} Angstroms. Otherwise return false (see partial_structrue_reader for description of frames_to check) """ errors = [] for ct in partial_structure_reader(filename, frames_to_check): one_dist = measure_distance(ct, name1, ct, name2) if (abs(one_dist - distance) > tolerance): msg = "ERROR: %s Distance " % ct.title + \ "{}-{}({:f} A) != {:f} +/- {:f} A".format(name1, name2, one_dist, distance, tolerance) errors.append(msg) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def check_torsion(filename, name1, name2, name3, name4, value, tolerance=10, frames_to_check=None): """ For the specified structure frames in a filename {string} return True if the torsion between atoms with names name1-4 {string} is value {float} +- tolerance{float} degrees. Otherwise return false (see partial_structure_reader for description of frames_to check) The format for name is <chain>:<residue number>:<pdb atom name> where a _ is replaced with a space (note that the <pdb atom name> should be 4 characters and padded with _ for spaces ie A:12:_CA_. """ errors = [] for ct in partial_structure_reader(filename, frames_to_check): iatoms = [] for name in name1, name2, name3, name4: alist = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name)) if len(alist) != 1: raise RuntimeError("%s in %s should match exactly 1 atom(%d)" % (name, filename, len(alist))) iatoms.append(alist[0]) one_tor = ct.measure(*iatoms) diff = min(abs(one_tor - value), abs(one_tor - 360 - value), abs(one_tor + 360 - value)) if diff > tolerance: output = False msg = "ERROR: %s Torsion " % ct.title + \ "%s(%d)-%s(%d)-%s(%d)-%s(%d)(%f deg) != %f +/- %f deg" % ( name1, iatoms[0], name2, iatoms[1], name3, iatoms[2], name4, iatoms[3], one_tor, value, tolerance) errors.append(msg) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def check_for_residue_list(filename, residues, frames_to_check=None): """ For the specified structure frames in a filename {string} return True if the residues match the residues listed in residues {string, space seperated list of residue names as given by the __str__ operator of structure._Residue} then return True. (see partial_structrue_reader for description of frames_to check) """ errors = [] input_residues = set(residues.split()) for ct in partial_structure_reader(filename, frames_to_check): ct_residues = {str(res) for res in ct.residue} for input_res in input_residues: if (input_res not in ct_residues): msg = f"{ct.title} has missing residue {input_res}" errors.append(msg) for ct_res in ct_residues: if (ct_res not in input_residues): msg = f"{ct.title} has extra residue {ct_res}" errors.append(msg) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def check_structure_counts(filename, atoms=None, residues=None, molecules=None, frames_to_check=None): """ STU workup For the specified structure frames in a filename {string} return True if all contain the specified numbers of atoms{integer}, residues{integer}, and molecules{integer}, If any of these are not set then that test is skipped. Otherwise return False. (see partial_structrue_reader for description of frames_to check) """ errors = [] for ct in partial_structure_reader(filename, frames_to_check): if (atoms is not None and len(ct.atom) != atoms): msg = "%s # Atoms %d != %d" % (ct.title, len(ct.atom), atoms) errors.append(msg) if (residues is not None and len(ct.residue) != residues): msg = "%s # Residues %d != %d" % (ct.title, len( ct.residue), residues) errors.append(msg) if (molecules is not None and len(ct.molecule) != molecules): msg = "%s # Molecules %d != %d" % (ct.title, len( ct.molecule), molecules) errors.append(msg) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def determine_truth_table(filename, property, reference_property, cutoff=None, reverse=False, frames_to_check=None): """ Return a truth table as a dictionary with keys tp, tn, fp, fn & "no prediction" with the number of frames where the prediction is a True Positive, True Negative, False Positive, False Negative, or no prediction was provided. For an input filename{string} containing one or more structures readable by StructureReader. property{string} the property name of predicted value stored as ct-level property filename referecne property{string} the property name of true value stored as ct-level property filename By default the value in property will be converted to a boolean, however the cutoff {variable that supports >=} is defined then a predicted value of True will given whenever the value is greater than this cutoff. If reverse is True then the predicted value will be reversed (this makes the most sense when cutoff is also used) frame_to_check is passed to partial_structure_reader """ truth_table = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'no prediction': 0} for ct in partial_structure_reader(filename, frames_to_check): if (reference_property not in ct.property): continue reference = bool(ct.property[reference_property]) if (property not in ct.property): truth_table['no prediction'] += 1 continue if (cutoff is None): pred = bool(ct.property[property]) else: pred = (ct.property[property] >= cutoff) if (reverse): pred = not pred if (pred and reference): truth_table['tp'] += 1 elif (not pred and not reference): truth_table['tn'] += 1 elif (pred and not reference): truth_table['fp'] += 1 elif (not pred and reference): truth_table['fn'] += 1 return truth_table
[docs]def check_prime_accuracy(min_accuracy, filename, property, reference_property, cutoff=None, reverse=False, frames_to_check=None): """ STU Workup that will check if the accuracy of a predictors is more than some cutoff value and raise an assertion error otherwise usage: :param min_accuracy: 0.0 - 1.0 :type min_accuracy: float :param filename: path of file readable by StructureReader :type filename: str :param property: property name of predicted value stored as ct-level property in the structures within filename :type property: str :param reference_property: property name of the reference value stored as ct-level property filename :type reference_property: str :param cutoff: variable that supports >= on property :type cutoff: int or float :type reverse: bool :param frame_to_check: indicies of frames within filename to process :type frames_to_check: str By default the value in property will be converted to a boolean, however the cutoff {variable that supports >=} is defined then a predicted value of True will given whenever the value is greater than this cutoff. If reverse is True then the predicted value will be reversed (this makes the most sense when cutoff is also used). """ truth_table = determine_truth_table(filename, property, reference_property, cutoff=cutoff, reverse=reverse, frames_to_check=frames_to_check) try: accuracy = old_div(float(truth_table['tp'] + truth_table['tn']), \ (truth_table['tp'] + truth_table['tn'] + truth_table['fp'] + truth_table['fn'])) except ZeroDivisionError: raise AssertionError("No predictions to check") if (accuracy < min_accuracy): raise AssertionError( "Accuracy %5.3f < %5.3f TP %d TN %d FP %d FN %d" % (accuracy, min_accuracy, truth_table['tp'], truth_table['tn'], truth_table['fp'], truth_table['fn'])) return True
[docs]def check_F1_score(min_F1, filename, property, reference_property, cutoff=None, reverse=False, frames_to_check=None): """ STU Workup that will check if the F1 Score of a predictors is more than some cutoff value and raise an assertion error otherwise :param min_f1: 0.0 -1.0 :type min_f1: float :param filename: path of file readable by StructureReader :type filename: str :param property: property name of predicted value stored as ct-level property in the structures within filename :type property: str :param reference_property: property name of the reference value stored as ct-level property filename :type reference_property: str :param cufoff: cutoff variable that supports >= on property :type cutoff: int or float :param reverse: reverse :type reverse: bool :param frame_to_check: indicies of frames within filename to process :type frames_to_check: str By default the value in property will be converted to a boolean, however the cutoff {variable that supports >=} is defined then a predicted value of True will given whenever the value is greater than this cutoff. If reverse is True then the predicted value will be reversed (this makes the most sense when cutoff is also used) """ truth_table = determine_truth_table(filename, property, reference_property, cutoff=cutoff, reverse=reverse, frames_to_check=frames_to_check) try: prec = 1.0 * truth_table['tp'] / (truth_table['tp'] + truth_table['fp']) rec = 1.0 * truth_table['tp'] / (truth_table['tp'] + truth_table['fn']) F1 = 2.0 * prec * rec / (prec + rec) except ZeroDivisionError: F1 = 0.0 if (F1 < min_F1): raise AssertionError("F1Score %5.3f < %5.3f TP %d TN %d FP %d FN %d" % (F1, min_F1, truth_table['tp'], truth_table['tn'], truth_table['fp'], truth_table['fn'])) return True
[docs]def check_ct_property(filename, property_name, value, tolerance="=", frames_to_check=None): """ STU Workup For a specified structure in a filename {string} return True if the property named property_name{string} is equal to value {any type} If tolerance is ">", "<", ">=", "<=" or "in" than use that operation instead of equals. Otherwise is tolerance is provided and not in the above list than return True iff all frames are within tolerance of value. """ errors = [] operator_map = { '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '=': operator.eq, '==': operator.eq, '!=': operator.ne, 'in': operator.contains } for ct in partial_structure_reader(filename, frames_to_check): try: prop_value = ct.property[property_name] except KeyError: msg = "Entry {} does not have property {}".format( ct.title, property_name) raise AssertionError(msg) if tolerance in operator_map: if (not operator_map[tolerance](prop_value, value)): msg = "Entry {} property {} ({}) is NOT {} {}".format( ct.title, property_name, prop_value, tolerance, value) errors.append(msg) else: if (abs(prop_value - value) > tolerance): msg = "Entry {} property {} ({}) is NOT == {} +/- {}".format( ct.title, property_name, prop_value, value, tolerance) errors.append(msg) if errors: raise AssertionError('\n'.join(errors)) return True
[docs]def check_ct_property_unique(filename, property_name, tolerance=0.00001, frames_to_check=None): """ Raise an AssertionError if there are entries with duplicated values of a given ct-level property. :param filename: Maestro formated structure file to read :type filename: str :param property_name: ct-level property name. This should start with `r_` or `i_` :type tolerance: float :param tolerance: Values within this tolerance are considered equal :type frames_to_check: list of int :param frames_to_check: check these frames in filename as described in partial structure reader """ values = [] for ct in partial_structure_reader(filename, frames_to_check): if (len(values) > 0): closest = np.min( np.abs(np.array(values) - ct.property[property_name])) if (closest < tolerance): raise AssertionError( "Entry {} in {}".format(ct.title, filename) + " property {}:{:f} is not unique".format( property_name, ct.property[property_name])) values.append(ct.property[property_name]) return True
[docs]def check_hbond(filename, name1, name2, frames_to_check=None, distance_max=2.5, donor_angle=120, acceptor_angle=90): """ Return True if the atom pair specified by `name1` and `name2` is a hydrogen bond in every frame in a structure file. For example, `check_hbond('some_file.maegz', 'A:1:H', 'A:4:O')` :type frames_to_check: list(int) :param frames_to_check: Only cts in `filename` for which their index (starting with 0) are in `frames_to_check` are returned. For example, `check_hbond('some_file.maegz', 'A:1:H', 'A:4:O', "[0, 6, 2]")` will check_hbonds in cts zero, six, and two. All selected cts must have the defined hbond for this check to return True If this argument is not set, then all cts in `filename` are checked. distance_max, donor_angleh, acceptor_angle: See description in schrodinger.structutils.interactions.hbond.match_hbond """ for ct in partial_structure_reader(filename, frames_to_check): list1 = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name1)) list2 = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name2)) if len(list1) != 1: error = "name {} in ct {} in filename {} matches more than one atom" error = error.format(name1, ct.title, filename) raise RuntimeError(error) if len(list2) != 1: error = "name {} in ct {} in filename {} matches more than one atom" error = error.format(name2, ct.title, filename) raise RuntimeError(error) atom1 = ct.atom[list1[0]] atom2 = ct.atom[list2[0]] hbond_found = hbond.match_hbond(atom1, atom2, distance_max, donor_angle, acceptor_angle) if not hbond_found: raise AssertionError('{} is not h-bonded to {} in structure {}'.\ format(name1, name2, ct.title)) return True
[docs]def check_position(filename, name, target_x, target_y, target_z, tolerance, frames_to_check=None): """ For the specified structures in a filename `string` return True if the specified residue or atom identified by `name` is within `tolerance` of the coordinate at `target_x, target_y, target_z`. If a residue is given, then any atom within `tolerance` of the target coordinates will return True. The distance must be less than or equal to the `tolerance`. For example:: check_position('some_file.maegz', 'A:1', 2.38, 0.91, 3.79, 1.0) :param frames_to_check: list(int) :type frames_to_check: Only cts in `filename` for which their index (starting with 0) are in frames_to_check are returned. For example:: check_position('some_file.maegz', 'A:1', 2.38, 0.91, 3.79, 1.0, "[0, 6, 2]") will check the position of residue A:1 in cts zero, six, and two. All selected cts must individually return True for check_position to return True. If this argument is not set, then all cts in `filename` are checked. """ tol_squared = tolerance**2 target_vector = np.array([target_x, target_y, target_z]) for ct in partial_structure_reader(filename, frames_to_check): iatoms = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name)) if len(iatoms) == 0: error = "name {} in ct {} in filename {} matches nothing" error = error.format(name, ct.title, filename) raise RuntimeError(error) atom_within_tol = False for iatom in iatoms: atom = ct.atom[iatom] delta_vec = target_vector - atom.xyz dist_squared = np.dot(delta_vec, delta_vec) if dist_squared <= tol_squared: atom_within_tol = True break if not atom_within_tol: # Example full message: # No atom in A:2:O was found within 2.00 A of # (2.38, 0.91, 3.79) across all cts in dummyfile.maegz msg = 'No atom in {} was found within {:.2f} A of '.\ format(name, tolerance) msg += '({:.2f}, {:.2f}, {:.2f}) '.format(*target_vector) if frames_to_check: msg += f'in cts {frames_to_check} ' else: msg += 'across all cts ' msg += f'in {filename}' raise AssertionError(msg) return True
[docs]def check_sequence(filename, chain, sequence): """ Check that the first structure in a file has a given 1-letter sequence inputs: :param filename: Maestro formatted file to check :type filename: str :param chain: Chain name to check :type chain: str (string of length 1 character) :param sequence: protein sequence to check for. The sequence given here must match that in the structure file exactly to pass. Any chain breaks (no matter the length in sequence space (ie A:2 - A:3 or A:2 to A:103)) are represented by a "-" :type sequence: str of upper-case single character protein codes :return: True is returned if this criteria is met, An AssertionError is raised if this criteria is not met. :rtype: bool """ ct = structure.Structure.read(filename) ct_sequence = "" for one_chain in ct.chain: if one_chain.name != chain: continue last_res = None for res in structure.get_residues_by_connectivity(one_chain): # Skip if this is an isolated residue if len(res.atom[1].getMolecule().residue) == 1: continue # Add a "-" if there is a chain break if last_res is not None and not last_res.isConnectedToResidue(res): ct_sequence += "-" ct_sequence += res.getCode() last_res = res if sequence != ct_sequence: raise AssertionError("Provided Sequence: %s\nSequence in File : %s" % (sequence, ct_sequence)) return True