Source code for schrodinger.test.stu.outcomes.structure_comparisons

"""
Provides functions for comparing structure files.  Supersedes the
compare_mae_files workup requested in QA-293. This set of workups was
requested in QA-1165.

Can be run as a script to compare two files::

    $SCHRODINGER/run structure_comparisons.py filename.mae reference_file.mae

@copyright: (c) Schrodinger, LLC All rights reserved.
"""

import math as _math
import pprint as _pprint
from collections import defaultdict as _defaultdict
from itertools import zip_longest

from schrodinger.infra import mm
from schrodinger.structure import StructureReader as _StructureReader
from schrodinger.structutils import analyze as _analyze
from schrodinger.structutils import rmsd as _rmsd
from schrodinger.structutils import smiles as _smiles
from schrodinger.test.stu import common
from schrodinger.utils import log

from . import failures as failure_types

logger = common.logger

_DEFAULT_TOLERANCE = 0.005
# To ignore a property.
IGNORE = 'IGNORE'


[docs]class StructureMismatch(failure_types.WorkupFailure): failure_type = 'workup: structure mismatch'
[docs]class CompositeFailure(StructureMismatch): """ Failures with lists or dicts of nested differences. """ def __str__(self): return "{}\n{}".format(self.args[0], self.formattedDetails()) __repr__ = __str__
[docs] def formattedDetails(self, width=80, max_number=5): """ Return a formatted version of the detailed diffs. If max_number is not -1, only max_number of diffs will be returned. """ if logger.level < log.INFO: max_number = -1 formatted = _pprint.pformat(self._getShortenedDetails(max_number), width=width) return formatted
def _getShortenedDetails(self, max_number=5): """ If max_number is not -1, only max_number of diffs will be returned. """ if max_number in (-1, None): return self.args[1] if hasattr(self.args[1], 'items'): return self._shortenDict(self.args[1], max_number) elif hasattr(self.args[1], 'append'): return self._shortenList(self.args[1], max_number) else: return self.args[1] def _shortenList(self, original, max_number=5): """Shorten a list.""" if max_number in (-1, None): return original shorter = [] short_warning = ( 'Printed only first %s items, use verbose to see all.' % max_number) for item in sorted(original[:max_number]): if hasattr(item, 'items'): shorter.append(self._shortenDict(item)) elif hasattr(item, 'append'): shorter.append(self._shortenList(item)) else: shorter.append(item) if len(shorter) != len(original): shorter.append(short_warning) return shorter def _shortenDict(self, original, max_number=5): """Shorten a dict.""" if max_number in (-1, None): return original shorter = {} short_warning = ( 'Printed only first %s items, use verbose to see all.' % max_number) for k, v in sorted(original.items())[:max_number]: try: if hasattr(v, 'items'): shorter[k] = self._shortenDict(v) elif hasattr(v, 'append'): shorter[k] = self._shortenList(v) else: shorter[k] = v except RuntimeError: shorter[k] = "another dict" if len(shorter) != len(original): shorter[short_warning] = '' return dict(shorter)
[docs]def compare_ct_properties(check, reference, *properties, ordered=True, **tolerances): """ Check that the values of the CT-level properties in `properties` match in `check` and `reference` across all cts. If no properties are listed, all are compared. Default tolerance is 0.005, but can be overridden with the tolerance keyword "tol". String properties are required to be identical. If the special tolerance value "IGNORE" appears in the tolerance dictionary, that property is ignored. examples:: # Compare all CT properties with a tolerance of +/- 0.05 compare_ct_properties('chk.mae', 'ref.mae', tol=0.05) # Compare r_m_pdb_PDB_R and s_m_source_file with a tolerance of +/- # 0.05 on numeric comparisons. compare_ct_properties('chk.mae', 'ref.mae', 'r_m_pdb_PDB_R', 's_m_source_file', tol=0.05) # Compare r_m_pdb_PDB_R and i_m_source_file_index with a tolerance of # +/- 1 on i_m_source_file_index, use the default tolerance of 0.005 on # other numeric comparisons. compare_ct_properties('chk.mae', 'ref.mae', 'r_m_pdb_PDB_R', 'i_m_source_file_index', i_m_source_file_index=1) # Compare all atom properties except the string property # s_m_source_file. compare_atom_properties('chk.mae', 'ref.mae', s_m_source_file='IGNORE') :type check: str :param check: First filename. :type reference: str :param reference: Reference filename. :param properties: Properties to check. :param bool ordered: If true, assume the order of the structures in the two files is the same. If False, structures will be matched based on title. :param tolerances: Tolerances by which each should be compared. Keys are property names, values are tolerances. :raise AssertionError: If properties do not match. :rtype: bool :return: Did all properties match? """ failures = {} if ordered: pair_getter = _zip_structures else: pair_getter = _ordered_structure_pairs for st_index, st1, st2 in pair_getter(check, reference): p1 = dict(st1.property) p1[mm.MMCT_STEREO_STATUS_PROP] = mm.mmct_ct_get_stereo_status(st1) p2 = dict(st2.property) p2[mm.MMCT_STEREO_STATUS_PROP] = mm.mmct_ct_get_stereo_status(st2) these_failures = _check_properties(p1, p2, properties, tolerances) if these_failures: failures["Structure %02d" % st_index] = these_failures if failures: msg = (f"CT properties did not match in {check} and {reference}.") raise CompositeFailure(msg, failures) return True
[docs]def compare_st_file_properties(check, reference, tolerance=_DEFAULT_TOLERANCE): """Alias for older workup.""" return compare_ct_properties(check, reference, tol=tolerance)
[docs]def compare_atom_properties(check, reference, *properties, **tolerances): """ Check that the values of the atom-level properties in `properties` match in in `check` and `reference` across all cts. If no properties are listed, all are compared. Default tolerance is 0.005, but can be overridden with the tolerance keyword "tol". String properties are required to be identical. If the special tolerance value "IGNORE" appears in the tolerance dictionary, that property is ignored. examples:: # Compare all atom properties with a tolerance of +/- 0.05 on numeric # comparisons. compare_atom_properties('chk.mae', 'ref.mae', tol=0.05) # Compare r_m_pdb_tfactor and s_m_chain_name with a tolerance of +/- # 0.05 on numeric comparisons (r_m_pdb_tfactor). The string comparison # (s_m_chain_name) is still required to match exactly. compare_atom_properties('chk.mae', 'ref.mae', 'r_m_pdb_tfactor', 's_m_chain_name', tol=0.05) # Compare r_m_pdb_tfactor and i_m_residue_number. Use a tolerance of # +/- 1 on i_m_residue_number, use the default tolerance of 0.005 on # other numeric comparisons (that is, r_m_pdb_tfactor). compare_atom_properties('chk.mae', 'ref.mae', 'r_m_pdb_tfactor', 'i_m_residue_number', i_m_residue_number=1) # Compare all atom properties except the string property s_m_chain_name compare_atom_properties('chk.mae', 'ref.mae', s_m_chain_name='IGNORE') :type check: str :param check: First filename. :type reference: str :param reference: Reference filename. :param properties: Properties to check. :param tolerances: Tolerances by which each should be compared. Keys are property names, values are tolerances. :raise AssertionError: If properties do not match. :rtype: bool :return: Did all properties match? """ failures = _defaultdict(dict) for st_index, st1, st2 in _zip_structures(check, reference): for atom_index, (atom1, atom2) in enumerate(zip(st1.atom, st2.atom), start=1): these_failures = _check_properties(atom1.property, atom2.property, properties, tolerances) if these_failures: atom_format = "Atom %%0%id" % (_math.log10(len(st1.atom)) + 1) failures["Structure %02d" % st_index][atom_format % atom_index] = these_failures if failures: msg = ("Atom properties did not match in {} and {}.".format( check, reference)) #Must cast as dict - prettyprint cannot deal with defaultdicts. raise CompositeFailure(msg, dict(failures)) return True
[docs]def compare_bond_properties(check, reference, *properties, **tolerances): """ Check that the values of the bond-level properties in `properties` match in `check` and `reference` across all cts. If no properties are listed, all are compared. Default tolerance is 0.005, but can be overridden with the tolerance keyword "tol". String properties are required to be identical. If the special tolerance value "IGNORE" appears in the tolerance dictionary, that property is ignored. :type check: str :param check: First filename. :type reference: str :param reference: Reference filename. :param properties: Properties to check. :param tolerances: Tolerances by which each should be compared. Keys are property names, values are tolerances. :raise AssertionError: If properties do not match. :rtype: bool :return: Did all properties match? """ failures = _defaultdict(dict) for st_index, st1, st2 in _zip_structures(check, reference): bonds1 = sorted(st1.bond, key=lambda b: (b.atom1.index, b.atom2.index)) bonds2 = sorted(st2.bond, key=lambda b: (b.atom1.index, b.atom2.index)) for bond1, bond2 in zip_longest(bonds1, bonds2): if not bond1 or not bond2: failures["Structure %02d" % st_index]['bond count mismatch'] = '' break these_failures = _check_properties(bond1.property, bond2.property, properties, tolerances) if these_failures: failures["Structure %02d" % st_index]["{} vs {}".format( bond1, bond2)] = these_failures if failures: msg = ("Bond properties did not match in {} and {}.".format( check, reference)) #Must cast as dict - prettyprint cannot deal with defaultdicts. raise CompositeFailure(msg, dict(failures)) return True
def _move_args_to_kwargs(args, kwargs): """ Split arguments with equals in them. This lets us emulate keyword arguments in STU workups. Returns the updated args list. Anything in `args` that has an "=" sign in it is moved to the `kwargs` dict. This is required because STU workups are not fully parsed to python-like arglist + kwarglist. """ def process1arg(arg): """Returns True if the property should be retained in the original list""" if '=' in arg: k, v = arg.split('=') try: kwargs[k] = float(v) return False except ValueError: if v.upper() == IGNORE: kwargs[k] = IGNORE return False return True updated_args = [arg for arg in args if process1arg(arg)] return updated_args def _check_properties(check, reference, properties=None, tolerances=None): """ Check the property dictionaries of two things, ensuring that the listed properties match. If no properties are listed, all are compared. If the special tolerance value "IGNORE" appears in the tolerance dictionary, that property is ignored. Properties with "=" in the name are considered tolerances, and are split on the "=". This is because of the way the arguments are passed to STU workups. """ failures = [] tolerances = tolerances or dict() # Split arguments with equals in them. This lets us emulate keyword # arguments in STU workups. properties = _move_args_to_kwargs(properties, tolerances) default_tolerance = tolerances.get('tol', _DEFAULT_TOLERANCE) if not properties: properties = set(list(check)) | set(list(reference)) format = "%s: %s != %s" for prop in properties: # If neither structure has the property, skip it. We still want # errors for new properties, though. if prop not in check and prop not in reference: continue # Properties can be ignored: SHARED-3801 if tolerances.get(prop, '') == IGNORE: continue try: diff = abs(check[prop] - reference[prop]) if diff > tolerances.get(prop, default_tolerance): failures.append(format % (prop, check[prop], reference[prop])) except KeyError: failures.append(prop + " is missing") except TypeError: #This is a string. if not check[prop] == reference[prop]: # Does an equality check as because Color objects lack an # __ne__ operator. See PYTHON-2489 failures.append(format % (prop, check[prop], reference[prop])) return failures
[docs]def rmsd(check, reference, tolerance=0.05, asl='all and NOT atom.element H', check_cts='', use_symmetry=False, in_place=False): """ After rotating molecules into the same frame of reference, check that the RMSD between matched cts in `check` and `reference` are below `tolerance`. If `asl` is provided, only matching atoms are compared. If there are multiple structures in the files, all will be compared unless `check_cts` is provided. `check_cts` is a comma separated list of CT indices, starting with ct 1. 'use_symmetry' indicates whether the rmsd calculations should take symmetry into account. If your structures contain dummy atoms, use `rmsd_in_place`. """ if isinstance(tolerance, str): msg = f"tolerance must be a number (found \"{tolerance}\")" raise TypeError(msg) failures = [] check_cts = str(check_cts).split(',') check_cts = {int(x) for x in check_cts if x} checked_cts = set() for (i, st1, st2) in _zip_structures(check, reference): if check_cts: if i not in check_cts: continue else: checked_cts.add(i) atom_list1 = _analyze.evaluate_asl(st1, asl) atom_list2 = _analyze.evaluate_asl(st2, asl) if len(atom_list1) != len(atom_list2): raise ValueError( 'Can not compare structures with diifferent ' + f' number of atoms ({len(atom_list1)} & {len(atom_list2)})') try: if in_place: value = _rmsd.calculate_in_place_rmsd(st1, atom_list1, st2, atom_list2, use_symmetry=use_symmetry) else: value = _rmsd.superimpose(st1, atom_list1, st2, atom_list2, use_symmetry=use_symmetry) except ValueError as err: # Superposition requires at least 3 atoms to match the ASL, we # should use either a more generous ASL or not do superposition. if "at least 3 atoms" not in str(err): raise # Run RMSD on all atoms atom_list1 = list(range(1, st1.atom_total + 1)) atom_list2 = list(range(1, st2.atom_total + 1)) three_or_less_atoms = not (st1.atom_total > 3 and st2.atom_total > 3) if three_or_less_atoms: value = _rmsd.calculate_in_place_rmsd(st1, atom_list1, st2, atom_list2, use_symmetry=use_symmetry) else: value = _rmsd.superimpose(st1, atom_list1, st2, atom_list2, use_symmetry=use_symmetry) if value > tolerance: failures.append("%i: %s" % (i, value)) if failures: msg = "RMSDs between structures in {} and {} were too large.".format( check, reference) raise CompositeFailure(msg, failures) check_cts -= checked_cts if check_cts: msg = "Requested CTs for RMSD were missing: %s" % ', '.join( str(x) for x in check_cts) raise failure_types.WorkupFailure(msg) return True
[docs]def rmsd_in_place(check, reference, tolerance=0.05, asl='all and NOT atom.element H', check_cts='', use_symmetry=False): """ Check that that RMSD between matched cts in `check` and `reference` are below `tolerance` in their current orientation. This is mostly useful for systems that contain dummy atoms, otherwise you'll want the `rmsd` workup. If there are multiple structures in the files, all will be compared unless `check_cts` is provided. `check_cts` is a comma separated list of CT indices, starting with ct 1. 'use_symmetry' is a boolean to indicate whether or not the RMSD calculation should take symmetry into account. """ return_val = rmsd(check, reference, tolerance, asl, check_cts, use_symmetry=use_symmetry, in_place=True) return return_val
[docs]def compare_bonding(check, reference): """ Check that the connectivity of each cts in `check` matches the connectivity of the corresponding ct in `reference`. """ smiles_generator = _smiles.SmilesGenerator( stereo=_smiles.STEREO_FROM_ANNOTATION_AND_GEOM) failures = [] for (i, st1, st2) in _zip_structures(check, reference): smiles1 = smiles_generator.getSmiles(st1) smiles2 = smiles_generator.getSmiles(st2) if smiles1 != smiles2: failures.append("Structure {}: {} != {}".format( i, smiles1, smiles2)) if failures: msg = ("Connectivities of structures in %s and %s did not match." % (check, reference)) raise CompositeFailure(msg, failures) return True
[docs]def compare_chiralities(check, reference): """ Check that the chiralities of each ct in `check` matches the chirality of the corresponding ct in `reference`. Both the chiral label and the CIP ranked list of neighbors are checked. """ failures = [] for (i, st1, st2) in _zip_structures(check, reference): chiralities1 = _analyze.get_chiralities(st1) chiralities2 = _analyze.get_chiralities(st2) if chiralities1 != chiralities2: chiralities1 = sorted(chiralities1.items()) chiralities2 = sorted(chiralities2.items()) failures.append(f"Structure {i}: {chiralities1} != {chiralities2}") if failures: msg = ("Chiralities of structures in %s and %s did not match." % (check, reference)) raise CompositeFailure(msg, failures) return True
[docs]def compare_num_atoms(check, reference, tolerance=0): """ Compare number of atoms in every CT of structure file `check` against `reference`. The number can differ by as much as `tolerance`. """ if isinstance(tolerance, str): msg = f"tolerance must be a number (found \"{tolerance}\")" raise TypeError(msg) failures = {} for (i, test_ct, ref_ct) in _zip_structures(check, reference): if abs(ref_ct.atom_total - test_ct.atom_total) > tolerance: failures["Structure %i" % i] = "%i != %i" % (test_ct.atom_total, ref_ct.atom_total) if failures: msg = ("Atom counts of structures in %s and %s did not match." % (check, reference)) raise CompositeFailure(msg, failures) return True
# It makes no sense to pass properties to compare into compare_mae_files, because # the properties will certainly not exist at both the CT and atom level. # However, STU cannot actually pass keyword arguments yet, so keyword arguments # are faked in _check_properties
[docs]def compare_mae_files(check, reference, *tolerance_list, **tolerances): """ Check that all atom, bond and CT level properties of matched cts in `check` and `reference` match within some tolerance. Also check RMSDs between all structures and bond orders. Tolerances should be specified as keyword/value pairs. Default tolerance is 0.005, but can be overridden with the tolerance keyword "tol". String properties are required to be identical. If the special tolerance value "IGNORE" appears in the tolerance dictionary, that property is ignored. RMSD tolerance can be specified with the tolerance keyword "rmsd". examples:: compare_mae_files('chk.mae', 'ref.mae', tol=0.05) Compare all atom, bond, and CT properties with a tolerance of +/- 0.05. Also compare RMSD and bonding. compare_mae_files('chk.mae', 'ref.mae', tol=0.05, rmsd=0.05) Compare all atom, bond, and CT properties with a tolerance of +/- 0.05. Also compare RMSD with a tolerance of 0.05, and compare bonding. compare_mae_files('chk.mae', 'ref.mae', r_m_pdb_tfactor=0.05, i_m_source_file_index=1) Compare all atom, bond, and CT properties, with a tolerance of +-0.05 for r_m_pdb_tfactor and +/-1 for i_m_source_file_index; other numeric comparisons use the default tolerance of 0.005. Also compare RMSD and bonding. compare_mae_files('chk.mae', 'ref.mae', r_m_pdb_tfactor=0.05, s_m_chain_name='IGNORE') Compare all atom, bond, and CT properties except s_m_chain_name, with a tolerance of +-0.05 for r_m_pdb_tfactor; other numeric comparisons use the default tolerance of 0.005. Also compare RMSD and bonding. :type check: str :param check: First filename. :type reference: str :param reference: Reference filename. :param tolerance_list: Tolerances by which each should be compared. :param tolerances: Tolerances by which each should be compared. :raise AssertionError: If properties do not match. :rtype: bool :return: Did all properties match? """ compare_bonding(check, reference) tolerance_list = _move_args_to_kwargs(tolerance_list, tolerances) rmsd(check, reference, tolerances.get('rmsd', 0.05)) compare_ct_properties(check, reference, *tolerance_list, **tolerances) compare_atom_properties(check, reference, *tolerance_list, **tolerances) return compare_bond_properties(check, reference, *tolerance_list, **tolerances)
def _zip_structures(check, reference): """ Iterate over the structures in two files. If there is a length mismatch, raise an AssertionError. """ try: reader1 = _StructureReader(check) reader2 = _StructureReader(reference) for (i, (ct1, ct2)) in enumerate(zip_longest(reader1, reader2), start=1): if not ct1: msg = f"{reference} has more entries than {check}." raise StructureMismatch(msg) elif not ct2: msg = f"{check} has more entries than {reference}." raise StructureMismatch(msg) else: yield i, ct1, ct2 except StopIteration: return def _ordered_structure_pairs(check, reference): """ Match structures in the check file with those in the reference file by title. Assumes all structures have unique titles. :param str check: Path to the check structure file :param str reference: Path to the reference structure file :rtype: A generator of (int, `structure.Structure`, `structure.Structure`) :return: The int is the structure index from the reference file, the first structure is from the check file, the second is from the reference file. :raise AssertionError: If a structure in one file is not found in the other """ check_sts = {} for check_index, struct in enumerate(_StructureReader(check), 1): check_sts[struct.title] = check_index for index, ref_st in enumerate(_StructureReader(reference), 1): try: check_index = check_sts.pop(ref_st.title) except KeyError: msg = f'Check file has no structure with title: {ref_st.title}' raise AssertionError(msg) check_st = _StructureReader.read(check, index=check_index) yield index, check_st, ref_st if check_sts: titles = ', '.join(check_sts.keys()) msg = f'Extra structures found in check file: {titles}' raise AssertionError(msg) if __name__ == '__main__': # Run compare_mae_files on file vs reference file import argparse description = ("Use compare_mae_files to check whether two structure " "files are the same.") parser = argparse.ArgumentParser(description=description) parser.add_argument('test_file') parser.add_argument('reference_file') opts = parser.parse_args() compare_mae_files(opts.test_file, opts.reference_file)