Source code for schrodinger.test.stu.outcomes.structure_comparisons

"""
Provides functions for comparing structure files.  Supersedes the
compare_mae_files workup requested in QA-293. This set of workups was
requested in QA-1165.

Can be run as a script to compare two files::

    $SCHRODINGER/run structure_comparisons.py filename.mae reference_file.mae

@copyright: (c) Schrodinger, LLC All rights reserved.
"""

import math as _math
import pprint as _pprint
from collections import defaultdict as _defaultdict
from itertools import zip_longest

from schrodinger.infra import mm
from schrodinger.structure import StructureReader as _StructureReader
from schrodinger.structutils import analyze as _analyze
from schrodinger.structutils import rmsd as _rmsd
from schrodinger.structutils import smiles as _smiles
from schrodinger.test.stu import common
from schrodinger.utils import log

from . import failures as failure_types

logger = common.logger

_DEFAULT_TOLERANCE = 0.005
# To ignore a property.
IGNORE = 'IGNORE'


[docs]class StructureMismatch(failure_types.WorkupFailure):
    failure_type = 'workup: structure mismatch'


[docs]class CompositeFailure(StructureMismatch):
    """
    Failures with lists or dicts of nested differences.
    """

    def __str__(self):
        return "{}\n{}".format(self.args[0], self.formattedDetails())

    __repr__ = __str__

[docs]    def formattedDetails(self, width=80, max_number=5):
        """
        Return a formatted version of the detailed diffs.  If max_number is not
        -1, only max_number of diffs will be returned.
        """
        if logger.level < log.INFO:
            max_number = -1
        formatted = _pprint.pformat(self._getShortenedDetails(max_number),
                                    width=width)
        return formatted

    def _getShortenedDetails(self, max_number=5):
        """
        If max_number is not -1, only max_number of diffs will be returned.
        """
        if max_number in (-1, None):
            return self.args[1]

        if hasattr(self.args[1], 'items'):
            return self._shortenDict(self.args[1], max_number)
        elif hasattr(self.args[1], 'append'):
            return self._shortenList(self.args[1], max_number)
        else:
            return self.args[1]

    def _shortenList(self, original, max_number=5):
        """Shorten a list."""
        if max_number in (-1, None):
            return original
        shorter = []
        short_warning = (
            'Printed only first %s items, use verbose to see all.' % max_number)
        for item in sorted(original[:max_number]):
            if hasattr(item, 'items'):
                shorter.append(self._shortenDict(item))
            elif hasattr(item, 'append'):
                shorter.append(self._shortenList(item))
            else:
                shorter.append(item)

        if len(shorter) != len(original):
            shorter.append(short_warning)
        return shorter

    def _shortenDict(self, original, max_number=5):
        """Shorten a dict."""
        if max_number in (-1, None):
            return original
        shorter = {}
        short_warning = (
            'Printed only first %s items, use verbose to see all.' % max_number)
        for k, v in sorted(original.items())[:max_number]:
            try:
                if hasattr(v, 'items'):
                    shorter[k] = self._shortenDict(v)
                elif hasattr(v, 'append'):
                    shorter[k] = self._shortenList(v)
                else:
                    shorter[k] = v
            except RuntimeError:
                shorter[k] = "another dict"

        if len(shorter) != len(original):
            shorter[short_warning] = ''
        return dict(shorter)


[docs]def compare_ct_properties(check,
                          reference,
                          *properties,
                          ordered=True,
                          **tolerances):
    """
    Check that the values of the CT-level properties in `properties` match in
    `check` and `reference` across all cts. If no properties are listed, all
    are compared. Default tolerance is 0.005, but can be overridden with the
    tolerance keyword "tol". String properties are required to be identical. If
    the special tolerance value "IGNORE" appears in the tolerance dictionary,
    that property is ignored.

    examples::

        # Compare all CT properties with a tolerance of +/- 0.05
        compare_ct_properties('chk.mae', 'ref.mae', tol=0.05)
        # Compare r_m_pdb_PDB_R and s_m_source_file with a tolerance of +/-
        # 0.05 on numeric comparisons.
        compare_ct_properties('chk.mae', 'ref.mae', 'r_m_pdb_PDB_R',
                              's_m_source_file', tol=0.05)
        # Compare r_m_pdb_PDB_R and i_m_source_file_index with a tolerance of
        # +/- 1 on i_m_source_file_index, use the default tolerance of 0.005 on
        # other numeric comparisons.
        compare_ct_properties('chk.mae', 'ref.mae', 'r_m_pdb_PDB_R',
                              'i_m_source_file_index', i_m_source_file_index=1)
        # Compare all atom properties except the string property
        # s_m_source_file.
        compare_atom_properties('chk.mae', 'ref.mae', s_m_source_file='IGNORE')

    :type check: str
    :param check: First filename.
    :type reference: str
    :param reference: Reference filename.
    :param properties: Properties to check.
    :param bool ordered: If true, assume the order of the structures in the
        two files is the same. If False, structures will be matched based on
        title.
    :param tolerances: Tolerances by which each should be compared. Keys are
                       property names, values are tolerances.
    :raise AssertionError: If properties do not match.
    :rtype: bool
    :return: Did all properties match?
    """

    failures = {}
    if ordered:
        pair_getter = _zip_structures
    else:
        pair_getter = _ordered_structure_pairs

    for st_index, st1, st2 in pair_getter(check, reference):
        p1 = dict(st1.property)
        p1[mm.MMCT_STEREO_STATUS_PROP] = mm.mmct_ct_get_stereo_status(st1)
        p2 = dict(st2.property)
        p2[mm.MMCT_STEREO_STATUS_PROP] = mm.mmct_ct_get_stereo_status(st2)
        these_failures = _check_properties(p1, p2, properties, tolerances)
        if these_failures:
            failures["Structure %02d" % st_index] = these_failures
    if failures:
        msg = (f"CT properties did not match in {check} and {reference}.")
        raise CompositeFailure(msg, failures)
    return True


[docs]def compare_st_file_properties(check, reference, tolerance=_DEFAULT_TOLERANCE):
    """Alias for older workup."""
    return compare_ct_properties(check, reference, tol=tolerance)


[docs]def compare_atom_properties(check, reference, *properties, **tolerances):
    """
    Check that the values of the atom-level properties in `properties` match in
    in `check` and `reference` across all cts.  If no properties are listed, all
    are compared. Default tolerance is 0.005, but can be overridden with the
    tolerance keyword "tol". String properties are required to be identical. If
    the special tolerance value "IGNORE" appears in the tolerance dictionary, that
    property is ignored.

    examples::

        # Compare all atom properties with a tolerance of +/- 0.05 on numeric
        # comparisons.
        compare_atom_properties('chk.mae', 'ref.mae', tol=0.05)
        # Compare r_m_pdb_tfactor and s_m_chain_name with a tolerance of +/-
        # 0.05 on numeric comparisons (r_m_pdb_tfactor).  The string comparison
        # (s_m_chain_name) is still required to match exactly.
        compare_atom_properties('chk.mae', 'ref.mae', 'r_m_pdb_tfactor',
                                's_m_chain_name', tol=0.05)
        # Compare r_m_pdb_tfactor and i_m_residue_number. Use a tolerance of
        # +/- 1 on i_m_residue_number, use the default tolerance of 0.005 on
        # other numeric comparisons (that is, r_m_pdb_tfactor).
        compare_atom_properties('chk.mae', 'ref.mae', 'r_m_pdb_tfactor',
                                'i_m_residue_number', i_m_residue_number=1)
        # Compare all atom properties except the string property s_m_chain_name
        compare_atom_properties('chk.mae', 'ref.mae', s_m_chain_name='IGNORE')

    :type check: str
    :param check: First filename.
    :type reference: str
    :param reference: Reference filename.
    :param properties: Properties to check.
    :param tolerances: Tolerances by which each should be compared. Keys are
                       property names, values are tolerances.
    :raise AssertionError: If properties do not match.
    :rtype: bool
    :return: Did all properties match?
    """
    failures = _defaultdict(dict)

    for st_index, st1, st2 in _zip_structures(check, reference):
        for atom_index, (atom1, atom2) in enumerate(zip(st1.atom, st2.atom),
                                                    start=1):
            these_failures = _check_properties(atom1.property, atom2.property,
                                               properties, tolerances)
            if these_failures:
                atom_format = "Atom %%0%id" % (_math.log10(len(st1.atom)) + 1)
                failures["Structure %02d" %
                         st_index][atom_format % atom_index] = these_failures

    if failures:
        msg = ("Atom properties did not match in {} and {}.".format(
            check, reference))
        #Must cast as dict - prettyprint cannot deal with defaultdicts.
        raise CompositeFailure(msg, dict(failures))
    return True


[docs]def compare_bond_properties(check, reference, *properties, **tolerances):
    """
    Check that the values of the bond-level properties in `properties` match
    in `check` and `reference` across all cts.  If no properties are listed,
    all are compared. Default tolerance is 0.005, but can be overridden with
    the tolerance keyword "tol". String properties are required to be
    identical. If the special tolerance value "IGNORE" appears in the tolerance
    dictionary, that property is ignored.

    :type check: str
    :param check: First filename.
    :type reference: str
    :param reference: Reference filename.
    :param properties: Properties to check.
    :param tolerances: Tolerances by which each should be compared. Keys are
                       property names, values are tolerances.
    :raise AssertionError: If properties do not match.
    :rtype: bool
    :return: Did all properties match?
    """
    failures = _defaultdict(dict)

    for st_index, st1, st2 in _zip_structures(check, reference):
        bonds1 = sorted(st1.bond, key=lambda b: (b.atom1.index, b.atom2.index))
        bonds2 = sorted(st2.bond, key=lambda b: (b.atom1.index, b.atom2.index))
        for bond1, bond2 in zip_longest(bonds1, bonds2):
            if not bond1 or not bond2:
                failures["Structure %02d" %
                         st_index]['bond count mismatch'] = ''
                break

            these_failures = _check_properties(bond1.property, bond2.property,
                                               properties, tolerances)
            if these_failures:
                failures["Structure %02d" % st_index]["{} vs {}".format(
                    bond1, bond2)] = these_failures

    if failures:
        msg = ("Bond properties did not match in {} and {}.".format(
            check, reference))
        #Must cast as dict - prettyprint cannot deal with defaultdicts.
        raise CompositeFailure(msg, dict(failures))
    return True


def _move_args_to_kwargs(args, kwargs):
    """
    Split arguments with equals in them.  This lets us emulate keyword
    arguments in STU workups. Returns the updated args list.

    Anything in `args` that has an "=" sign in it is moved to the
    `kwargs` dict. This is required because STU workups are not fully parsed
    to python-like arglist + kwarglist.

    """

    def process1arg(arg):
        """Returns True if the property should be retained in the original list"""
        if '=' in arg:
            k, v = arg.split('=')
            try:
                kwargs[k] = float(v)
                return False
            except ValueError:
                if v.upper() == IGNORE:
                    kwargs[k] = IGNORE
                    return False
        return True

    updated_args = [arg for arg in args if process1arg(arg)]
    return updated_args


def _check_properties(check, reference, properties=None, tolerances=None):
    """
    Check the property dictionaries of two things, ensuring that the listed
    properties match.  If no properties are listed, all are compared. If the
    special tolerance value "IGNORE" appears in the tolerance dictionary, that
    property is ignored.

    Properties with "=" in the name are considered tolerances, and are split on
    the "=".  This is because of the way the arguments are passed to STU
    workups.
    """
    failures = []

    tolerances = tolerances or dict()
    # Split arguments with equals in them.  This lets us emulate keyword
    # arguments in STU workups.
    properties = _move_args_to_kwargs(properties, tolerances)
    default_tolerance = tolerances.get('tol', _DEFAULT_TOLERANCE)

    if not properties:
        properties = set(list(check)) | set(list(reference))

    format = "%s: %s != %s"
    for prop in properties:
        # If neither structure has the property, skip it.  We still want
        # errors for new properties, though.
        if prop not in check and prop not in reference:
            continue

        # Properties can be ignored: SHARED-3801
        if tolerances.get(prop, '') == IGNORE:
            continue

        try:
            diff = abs(check[prop] - reference[prop])
            if diff > tolerances.get(prop, default_tolerance):
                failures.append(format % (prop, check[prop], reference[prop]))
        except KeyError:
            failures.append(prop + " is missing")
        except TypeError:
            #This is a string.
            if not check[prop] == reference[prop]:
                # Does an equality check as because Color objects lack an
                # __ne__ operator.  See PYTHON-2489
                failures.append(format % (prop, check[prop], reference[prop]))
    return failures


[docs]def rmsd(check,
         reference,
         tolerance=0.05,
         asl='all and NOT atom.element H',
         check_cts='',
         use_symmetry=False,
         in_place=False):
    """
    After rotating molecules into the same frame of reference, check that the
    RMSD between matched cts in `check` and `reference` are below
    `tolerance`.  If `asl` is provided, only matching atoms are compared.
    If there are multiple structures in the files, all will be compared unless
    `check_cts` is provided.  `check_cts` is a comma separated list of CT
    indices, starting with ct 1.
    'use_symmetry' indicates whether the rmsd calculations should take
    symmetry into account.

    If your structures contain dummy atoms, use `rmsd_in_place`.
    """
    if isinstance(tolerance, str):
        msg = f"tolerance must be a number (found \"{tolerance}\")"
        raise TypeError(msg)

    failures = []
    check_cts = str(check_cts).split(',')
    check_cts = {int(x) for x in check_cts if x}
    checked_cts = set()
    for (i, st1, st2) in _zip_structures(check, reference):
        if check_cts:
            if i not in check_cts:
                continue
            else:
                checked_cts.add(i)
        atom_list1 = _analyze.evaluate_asl(st1, asl)
        atom_list2 = _analyze.evaluate_asl(st2, asl)
        if len(atom_list1) != len(atom_list2):
            raise ValueError(
                'Can not compare structures with diifferent ' +
                f' number of atoms ({len(atom_list1)} & {len(atom_list2)})')
        try:
            if in_place:
                value = _rmsd.calculate_in_place_rmsd(st1,
                                                      atom_list1,
                                                      st2,
                                                      atom_list2,
                                                      use_symmetry=use_symmetry)
            else:
                value = _rmsd.superimpose(st1,
                                          atom_list1,
                                          st2,
                                          atom_list2,
                                          use_symmetry=use_symmetry)
        except ValueError as err:
            # Superposition requires at least 3 atoms to match the ASL, we
            # should use either a more generous ASL or not do superposition.
            if "at least 3 atoms" not in str(err):
                raise
            # Run RMSD on all atoms
            atom_list1 = list(range(1, st1.atom_total + 1))
            atom_list2 = list(range(1, st2.atom_total + 1))
            three_or_less_atoms = not (st1.atom_total > 3 and
                                       st2.atom_total > 3)
            if three_or_less_atoms:
                value = _rmsd.calculate_in_place_rmsd(st1,
                                                      atom_list1,
                                                      st2,
                                                      atom_list2,
                                                      use_symmetry=use_symmetry)
            else:
                value = _rmsd.superimpose(st1,
                                          atom_list1,
                                          st2,
                                          atom_list2,
                                          use_symmetry=use_symmetry)
        if value > tolerance:
            failures.append("%i: %s" % (i, value))

    if failures:
        msg = "RMSDs between structures in {} and {} were too large.".format(
            check, reference)
        raise CompositeFailure(msg, failures)

    check_cts -= checked_cts
    if check_cts:
        msg = "Requested CTs for RMSD were missing: %s" % ', '.join(
            str(x) for x in check_cts)
        raise failure_types.WorkupFailure(msg)
    return True


[docs]def rmsd_in_place(check,
                  reference,
                  tolerance=0.05,
                  asl='all and NOT atom.element H',
                  check_cts='',
                  use_symmetry=False):
    """
    Check that that RMSD between matched cts in `check` and `reference` are
    below `tolerance` in their current orientation. This is mostly useful for
    systems that contain dummy atoms, otherwise you'll want the `rmsd` workup.
    If there are multiple structures in the files, all will be compared unless
    `check_cts` is provided.  `check_cts` is a comma separated list of CT
    indices, starting with ct 1. 'use_symmetry' is a boolean to indicate
    whether or not the RMSD calculation should take symmetry into account.
    """

    return_val = rmsd(check,
                      reference,
                      tolerance,
                      asl,
                      check_cts,
                      use_symmetry=use_symmetry,
                      in_place=True)

    return return_val


[docs]def compare_bonding(check, reference):
    """
    Check that the connectivity of each cts in `check` matches the connectivity
    of the corresponding ct in `reference`.
    """
    smiles_generator = _smiles.SmilesGenerator(
        stereo=_smiles.STEREO_FROM_ANNOTATION_AND_GEOM)

    failures = []
    for (i, st1, st2) in _zip_structures(check, reference):
        smiles1 = smiles_generator.getSmiles(st1)
        smiles2 = smiles_generator.getSmiles(st2)
        if smiles1 != smiles2:
            failures.append("Structure {}: {} != {}".format(
                i, smiles1, smiles2))

    if failures:
        msg = ("Connectivities of structures in %s and %s did not match." %
               (check, reference))
        raise CompositeFailure(msg, failures)
    return True


[docs]def compare_chiralities(check, reference):
    """
    Check that the chiralities of each ct in `check` matches the chirality
    of the corresponding ct in `reference`.

    Both the chiral label and the CIP ranked list of neighbors are checked.
    """
    failures = []
    for (i, st1, st2) in _zip_structures(check, reference):
        chiralities1 = _analyze.get_chiralities(st1)
        chiralities2 = _analyze.get_chiralities(st2)
        if chiralities1 != chiralities2:
            chiralities1 = sorted(chiralities1.items())
            chiralities2 = sorted(chiralities2.items())
            failures.append(f"Structure {i}: {chiralities1} != {chiralities2}")

    if failures:
        msg = ("Chiralities of structures in %s and %s did not match." %
               (check, reference))
        raise CompositeFailure(msg, failures)
    return True


[docs]def compare_num_atoms(check, reference, tolerance=0):
    """
    Compare number of atoms in every CT of structure file `check` against
    `reference`. The number can differ by as much as `tolerance`.
    """
    if isinstance(tolerance, str):
        msg = f"tolerance must be a number (found \"{tolerance}\")"
        raise TypeError(msg)

    failures = {}
    for (i, test_ct, ref_ct) in _zip_structures(check, reference):
        if abs(ref_ct.atom_total - test_ct.atom_total) > tolerance:
            failures["Structure %i" %
                     i] = "%i != %i" % (test_ct.atom_total, ref_ct.atom_total)

    if failures:
        msg = ("Atom counts of structures in %s and %s did not match." %
               (check, reference))
        raise CompositeFailure(msg, failures)
    return True


# It makes no sense to pass properties to compare into compare_mae_files, because
# the properties will certainly not exist at both the CT and atom level.
# However, STU cannot actually pass keyword arguments yet, so keyword arguments
# are faked in _check_properties
[docs]def compare_mae_files(check, reference, *tolerance_list, **tolerances):
    """
    Check that all atom, bond and CT level properties of matched cts in `check`
    and `reference` match within some tolerance.  Also check RMSDs between all
    structures and bond orders. Tolerances should be specified as
    keyword/value pairs.

    Default tolerance is 0.005, but can be overridden with the tolerance
    keyword "tol". String properties are required to be identical. If the
    special tolerance value "IGNORE" appears in the tolerance dictionary, that
    property is ignored. RMSD tolerance can be specified with the tolerance
    keyword "rmsd".

    examples::
        compare_mae_files('chk.mae', 'ref.mae', tol=0.05)
            Compare all atom, bond, and CT properties with a tolerance of +/-
            0.05. Also compare RMSD and bonding.
        compare_mae_files('chk.mae', 'ref.mae', tol=0.05, rmsd=0.05)
            Compare all atom, bond, and CT properties with a tolerance of
            +/- 0.05. Also compare RMSD with a tolerance of 0.05, and compare
            bonding.
        compare_mae_files('chk.mae', 'ref.mae', r_m_pdb_tfactor=0.05, i_m_source_file_index=1)
            Compare all atom, bond, and CT properties, with a tolerance of
            +-0.05 for r_m_pdb_tfactor and +/-1 for i_m_source_file_index; other
            numeric comparisons use the default tolerance of 0.005.  Also
            compare RMSD and bonding.
        compare_mae_files('chk.mae', 'ref.mae', r_m_pdb_tfactor=0.05, s_m_chain_name='IGNORE')
            Compare all atom, bond, and CT properties except s_m_chain_name,
            with a tolerance of +-0.05 for r_m_pdb_tfactor; other numeric
            comparisons use the default tolerance of 0.005.  Also compare RMSD
            and bonding.

    :type check: str
    :param check: First filename.
    :type reference: str
    :param reference: Reference filename.
    :param tolerance_list: Tolerances by which each should be compared.
    :param tolerances: Tolerances by which each should be compared.
    :raise AssertionError: If properties do not match.
    :rtype: bool
    :return: Did all properties match?
    """
    compare_bonding(check, reference)
    tolerance_list = _move_args_to_kwargs(tolerance_list, tolerances)
    rmsd(check, reference, tolerances.get('rmsd', 0.05))
    compare_ct_properties(check, reference, *tolerance_list, **tolerances)
    compare_atom_properties(check, reference, *tolerance_list, **tolerances)
    return compare_bond_properties(check, reference, *tolerance_list,
                                   **tolerances)


def _zip_structures(check, reference):
    """
    Iterate over the structures in two files.  If there is a length mismatch,
    raise an AssertionError.

    """
    try:
        reader1 = _StructureReader(check)
        reader2 = _StructureReader(reference)
        for (i, (ct1, ct2)) in enumerate(zip_longest(reader1, reader2),
                                         start=1):
            if not ct1:
                msg = f"{reference} has more entries than {check}."
                raise StructureMismatch(msg)
            elif not ct2:
                msg = f"{check} has more entries than {reference}."
                raise StructureMismatch(msg)
            else:
                yield i, ct1, ct2
    except StopIteration:
        return


def _ordered_structure_pairs(check, reference):
    """
    Match structures in the check file with those in the reference file by
    title. Assumes all structures have unique titles.

    :param str check: Path to the check structure file

    :param str reference: Path to the reference structure file

    :rtype: A generator of (int, `structure.Structure`, `structure.Structure`)
    :return: The int is the structure index from the reference file, the first
        structure is from the check file, the second is from the reference file.

    :raise AssertionError: If a structure in one file is not found in the other
    """

    check_sts = {}
    for check_index, struct in enumerate(_StructureReader(check), 1):
        check_sts[struct.title] = check_index

    for index, ref_st in enumerate(_StructureReader(reference), 1):
        try:
            check_index = check_sts.pop(ref_st.title)
        except KeyError:
            msg = f'Check file has no structure with title: {ref_st.title}'
            raise AssertionError(msg)
        check_st = _StructureReader.read(check, index=check_index)
        yield index, check_st, ref_st
    if check_sts:
        titles = ', '.join(check_sts.keys())
        msg = f'Extra structures found in check file: {titles}'
        raise AssertionError(msg)


if __name__ == '__main__':
    # Run compare_mae_files on file vs reference file

    import argparse
    description = ("Use compare_mae_files to check whether two structure "
                   "files are the same.")
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('test_file')
    parser.add_argument('reference_file')
    opts = parser.parse_args()
    compare_mae_files(opts.test_file, opts.reference_file)