Source code for schrodinger.structutils.analyze

"""
Functions for analyzing `Structure objects<Structure>`.

`AslLigandSearcher` is a class that identifies putative ligands in a structure.
Each putative found ligand is contained in a `Ligand` instance.

There are also a number of functions for using SMARTS, ASL, and SMILES (e.g.
`evaluate_smarts_canvas` or `generate_smiles`).  Other functions return information
about a structure (i.e. `get_chiral_atoms` or `hydrogens_present`). There are
also several SASA (Solvent Accessible Surface Area) functions (i.e.
`calculate_sasa_by_atom` and `calculate_sasa_by_residue` and `calculate_sasa`).

See also the discussion in the Python API overview.

@copyright: Schrodinger, LLC. All rights reserved.
"""

from typing import Iterable
from typing import List
from typing import Optional
from typing import Set
import collections
import itertools
import math
import time
import warnings

import networkx
import numpy

import schrodinger.structutils.measure
from schrodinger import geometry
from schrodinger import get_maestro
from schrodinger import adapter
from schrodinger.infra import canvas
from schrodinger.infra import mm
from schrodinger.infra import mmbitset
from schrodinger.infra import mmerr
from schrodinger.infra import mminit
from schrodinger.infra import mmlist
from schrodinger.infra.mmbitset import Bitset
from schrodinger.structure import NO_STEREO  # noqa: F401
from schrodinger.structure import RESIDUE_MAP_3_TO_1_LETTER  # noqa: F401
from schrodinger.structure import STEREO_FROM_ANNOTATION_AND_GEOM
from schrodinger.structure import Structure
from schrodinger.structure import _Ring
from schrodinger.structure import create_new_structure
from schrodinger.structure import get_residues_by_connectivity
from schrodinger.structutils import transform
from schrodinger.structutils import minimize
from schrodinger.structutils.interactions import hbond
from schrodinger.test import ioredirect
from schrodinger.utils import log
from schrodinger.math import mathutils

smiles = None
_canvas_license = None
_canvas = None

logger = log.get_logger("schrodinger.structutils.analyze")

_initializer = mminit.Initializer(
    [
        mm.mmstereo_initialize, mm.mmpatty_initialize, mm.mmsubs_initialize,
        mm.mmasl_initialize, mm.mmhtreat_initialize, mm.mmcrystal_initialize,
        mm.mmsmiles_initialize
    ],
    [
        mm.mmstereo_terminate, mm.mmpatty_terminate, mm.mmsubs_terminate,
        mm.mmasl_terminate, mm.mmhtreat_terminate, mm.mmcrystal_terminate,
        lambda: None
    ]  # Currently terminating and then reinitializing mmsmiles
    # causes a problem, so don't terminate.
)

RDKIT_MAX_MATCHES = 1000000000  # See DESMOND-11242 and RB 65689


def _get_chiralities(stereo_handle):
    """
    Helper for get_chiral_atoms(st) and get_chiralities(st).

    :type stereo_handle: int
    :param stereo_handle: mmstereo handle for the Structure being
             worked on.

    :rtype: dict
    :return: Dictionary of chiralities keyed off atom index.

    """
    rs = {
        mm.MMSTEREO_CHIRALITY_R: "R",
        mm.MMSTEREO_CHIRALITY_S: "S",
        mm.MMSTEREO_CHIRALITY_ANR: "ANR",
        mm.MMSTEREO_CHIRALITY_ANS: "ANS",
    }  # NOTE: Any other chirality will be returned as "undef"

    (atoms, chiralities) = mm.mmstereo_stereo_atoms(stereo_handle)

    at = mmlist.mmlist(atoms)
    ch = mmlist.mmlist(chiralities)

    result = {}
    for i in range(0, len(at)):
        anum = at[i]
        chirality = ch[i]
        result[anum] = rs.get(chirality, 'undef')

    return result


[docs]def get_chiral_atoms(structure): """ Return a dictionary of chiral atoms, for which the key is the atom index and the value is one of the following strings: "R", "S", "ANR", "ANS", "undef". ANR and ANS designate "chiralities" of non-chiral atoms that are important for determining the structure of the molecule (ex: cis/trans rings). :type structure: `Structure` :param structure: Chirality of atoms within this structure will be determined. :rtype: dict :return: Dictionary of chiralities keyed off atom index. """ stereo = mm.mmstereo_new(structure) result = _get_chiralities(stereo) mm.mmstereo_delete(stereo) return result
[docs]def get_chiralities(structure): """ Return a dictionary of chiral atoms, for which the key is the atom index and the value is a tuple of one of the following strings: "R", "S", "ANR", "ANS", "undef" and the list of CIP ranked neighbors. ANR and ANS designate "chiralities" of non-chiral atoms that are important for determining the structure of the molecule (ex: cis/trans rings). :type structure: `Structure` :param structure: Chirality of atoms within this structure will be determined. :rtype: dict :return: Dictionary of chiralities keyed off atom index. """ stereo = mm.mmstereo_new(structure) result = _get_chiralities(stereo) for at in result.keys(): ch = result[at] bonded_list = mm.mmstereo_get_bonded_mmlist(stereo, at) bonded = mmlist._mmlist_to_pylist(bonded_list) result[at] = (ch, tuple(bonded)) mm.mmstereo_delete(stereo) return result
[docs]def evaluate_smarts(structure, smarts_expression, verbose=False, first_match_only=False, unique_sets=False): """:deprecated: Use `schrodinger.adapter.evaluate_smarts` instead.""" msg = ("analyze.evaluate_smarts() is deprecated. Use " "schrodinger.adapter.evaluate_smarts() instead.") warnings.warn(msg, DeprecationWarning, stacklevel=2) return_list = [] smarts_expression = smarts_expression.strip() if len(smarts_expression) == 0: return return_list # Explicitly compile the pattern because it makes it easier to catch # pattern errors. try: p = mm.mmpatty_new(smarts_expression) except mm.MmException as e: if e.rc == mm.MMPATTY_INVALID_PATTERN: msg = 'Invalid smarts expression: "%s"' % smarts_expression raise ValueError(msg) mm.mmpatty_start_ct(structure, verbose) matches = mm.mmpatty_match_comp(p, first_match_only, verbose) for match in matches: return_list.append(mmlist._mmlist_to_pylist(match)) mm.mmlist_delete(match) mm.mmpatty_finished_ct() mm.mmpatty_delete(p) if unique_sets: filtered_list = [] unique = set() for item in return_list: key = tuple(sorted(item)) if key not in unique: filtered_list.append(item) unique.add(key) return_list = filtered_list return return_list
[docs]def validate_smarts(smarts): """:deprecated: Use `schrodinger.adapter.validate_smarts` instead.""" msg = ("analyze.validate_smarts() is deprecated. Use " "schrodinger.adapter.validate_smarts() instead.") warnings.warn(msg, DeprecationWarning, stacklevel=2) try: p = mm.mmpatty_new(smarts) return [True, ""] except mm.MmException as e: msg = 'Invalid smarts expression: "%s"' % smarts return [False, msg]
[docs]def count_atoms_in_smarts(smarts): """ Return the number of atoms that the given SMARTS pattern has. :param smarts: SMARTS pattern :type smarts: str :return: Number of atoms in the pattern :rtype: int :raises: ValueError is the pattern is invalid. """ # Supress errors so that they don't print to stdout: error_handle = mm.mmpatty_get_error_handler() with mmerr.Level(mm.MMERR_OFF, error_handle): try: patty = mm.mmpatty_new(smarts) except mm.MmException as e: if e.rc == mm.MMPATTY_INVALID_PATTERN: raise ValueError(f'Invalid SMARTS pattern: "{smarts}"') else: raise else: # Get the number of atoms of the pattern: natoms = mm.mmpatty_num_atoms(patty) mm.mmpatty_delete(patty) return natoms
[docs]def lazy_canvas_import(): """ Initialize _canvas and smiles variables. Since the canvas modules may take some time to import, before it an function loading time. """ global _canvas_license, _canvas, smiles if _canvas_license is None: if _canvas is None: import schrodinger.application.canvas.base _canvas = schrodinger.application.canvas.base import schrodinger.application.canvas.utils _canvas_license = schrodinger.application.canvas.utils.get_license( schrodinger.application.canvas.utils.LICENSE_SHARED) if smiles is None: import schrodinger.structutils.smiles as smiles
[docs]def validate_smarts_canvas(smarts): """:deprecated: Use `schrodinger.adapter.validate_smarts` instead.""" lazy_canvas_import() return _canvas.ChmQuery.validateSMARTS(smarts)
[docs]def evaluate_smarts_canvas(structure, smarts, stereo=STEREO_FROM_ANNOTATION_AND_GEOM, start_index=1, uniqueFilter=True, allowRelativeStereo=False, rigorousValidationOfSource=False, hydrogensInterchangeable=False, multiple_smarts=False): """:deprecated: Use `schrodinger.adapter.evaluate_smarts` instead.""" lazy_canvas_import() if isinstance(structure, _canvas.ChmMol): mol = structure else: mol = create_chmmol_from_structure(structure, stereo=stereo) smarts = smarts if multiple_smarts else [smarts] ret = [] for pattern in smarts: pattern = pattern.strip() q = _canvas.ChmQuery(pattern, True) m = _canvas.ChmQueryMatcher(uniqueFilter, allowRelativeStereo, rigorousValidationOfSource, hydrogensInterchangeable) matches = [] for match in m.match(q, mol): matches.append([ a.getMolIndex() + start_index for a in match.getMatchedAtoms() ]) ret.append(matches) return ret if multiple_smarts else ret[0]
[docs]def evaluate_smarts_by_molecule(structure, smarts, timing_data=None, canvas=True, use_rdkit=False, matches_by_mol=False, molecule_numbers=None, **kwargs): """ Takes a structure and a SMARTS pattern and returns a list of all matching atom indices, where each element in the list is a group of atoms that match the the SMARTS pattern. The advantage of this function over evaluate_smarts_canvas is that it does a SMARTS match for each molecule in a structure rather than over the entire structure at once. SMARTS evaluation scales as N^2 with the size of the structure searched. Doing many SMARTS evaluations over small molecules will have a significant speedup over one SMARTS evaluation over a composite structure. The return value of this function is identical to the return value of the evaluate_smarts_canvas function (or evaluate_smarts function if canvas=False) with the possible exception of the order of the matches. Do not use this function if the SMARTS match can span molecules. This simply fails to match invalid SMARTS patterns and also discards any empty matches. Additional keyword arguments are passed to the SMARTS matching function :type structure: structure.Structure :param structure: the structure to search :type smarts: str :param smarts: the SMARTS pattern to match :type timing_data: dict or None :param timing_data: If supplied this dict will be filled with timing data for the SMARTS finding. Data will be recorded for each molecule searched. Keys will be the number of atoms in a molecule, each value will be a list. Each item in the list will be the time in seconds it took to search a molecule with that many atoms. :type canvas: bool :param canvas: If True, use Canvas SMARTS matching, if False, use mmpatty :param bool use_rdkit: Whether to use RDKIT. Cannot be used together with canvas :type matches_by_mol: bool :param matches_by_mol: if True then rather than returning a list of matches return a dictionary of matches key-ed by molecule number :type molecule_numbers: set :param molecule_numbers: set of molecule numbers in the structure to be used instead of the entire structure :rtype: list or dict :return: For the list (if matches_by_mol is False) each value is a list of atom indices matching the SMARTS pattern, for the dict (if matches_by_mol is True) keys are molecule indices and values are lists of matches for that molecule """ assert not (canvas and use_rdkit), ('"canvas" and "use_rdkit" cannot be ' 'both requested.') if structure is None: return [] matches = {} # SMARTs patterns are intra-molecular # so we iterate over molecules in struct if molecule_numbers is None: molecules = structure.molecule else: # Using enumerate as structure.molecule[x] is very slow #SHARED-6204 molecules = [ mol for mol in structure.molecule if mol.number in molecule_numbers ] for mol in molecules: new_to_old = \ dict(list(zip(list(range(1, len(mol.atom) + 1)), mol.getAtomIndices()))) try: if timing_data is not None: start = time.time() if canvas: match = evaluate_smarts_canvas(mol.extractStructure(), smarts, **kwargs) elif use_rdkit: uniquify = kwargs.get('unique_sets', True) match = adapter.evaluate_smarts(mol.extractStructure(), smarts, uniquify, RDKIT_MAX_MATCHES) else: match = evaluate_smarts(mol.extractStructure(), smarts, **kwargs) if timing_data is not None: end = time.time() timing_data.setdefault(len(mol.atom), []).append(end - start) # Remove empty matches and use structure atom indexes # rather than by-molecule atom indexes for a_match in match: if a_match: b_match = [new_to_old[x] for x in a_match] matches.setdefault(mol.number, []).append(b_match) except mm.MmException: pass if not matches_by_mol: matches = [ match for mol_match in list(matches.values()) for match in mol_match ] return matches
[docs]def evaluate_multiple_smarts(structure, smarts_list, verbose=False, first_match_only=False, unique_sets=False, keep_nested=False): """ Search for multiple SMARTS substructures in `Structure` `structure`. Return a list of lists of ints. Each list of ints is a list of atom indices matching a SMARTS pattern. The multiple SMARTS patterns are combined into one list. :type structure: `Structure` :param structure: Structure to search for matching substructures. :type smarts_list: list :param smarts_list: List of SMARTS patterns to look for. :type verbose: bool :param verbose: If True, print additional progress reports from the C implementation. :type first_match_only: bool :param first_match_only: If False, return all matches for a given starting atom - e.g. [1, 2, 3, 4, 5, 6] and [1, 6, 5, 4, 3, 2] from atom 1 of benzene with the smarts_expression 'c1ccccc1'. If True, return only the first match found for a given starting atom. Note that setting first_match_only to True does not affect matches with different starting atoms - i.e. benzene will still return six lists of ints for 'c1ccccc1', one for each starting atom. To match only once per set of atoms, use unique_sets=True. Note also that setting first_match_only to True does not guarantee that all matching atoms will be found. :type unique_sets: bool :param unique_sets: If True, the returned list of matches will contain a single (arbitrary) match for any given set of atoms. If False, return the uniquely ordered matches, subject to the behavior specified by the first_match_only parameter. :rtype: list :return: Each value is a list of atom indices matching the SMARTS patterns. """ if len(smarts_list) == 0: return [] return_list = [[] for idx in range(len(smarts_list))] mm.mmpatty_start_ct(structure, verbose) # Iterate over smarts patterns and get matches for each: for idx, smarts_expression in enumerate(smarts_list): if len(smarts_expression) == 0: continue try: p = mm.mmpatty_new(smarts_expression) except mm.MmException as e: if e.rc == mm.MMPATTY_INVALID_PATTERN: msg = 'Invalid smarts expression: "%s"' % smarts_expression raise ValueError(msg) matches = mm.mmpatty_match_comp(p, first_match_only, verbose) for match in matches: # List of atoms matching the smarts_expression: match_list = mmlist._mmlist_to_pylist(match) return_list[idx].append(match_list) mm.mmlist_delete(match) mm.mmpatty_delete(p) mm.mmpatty_finished_ct() if unique_sets: for idx in range(len(return_list)): matches = return_list[idx] filtered_list = [] unique = set() for item in return_list[idx]: key = tuple(sorted(item)) if key not in unique: filtered_list.append(item) unique.add(key) return_list[idx] = filtered_list if not keep_nested: ret = [match for matches in return_list for match in matches] return ret return return_list
[docs]def evaluate_substructure(st, subs_expression, first_match_only=False): """ Search for the MacroModel-style substructure expression in `Structure` `st` and return the atoms that match. For each match a list of atom indices is returned. :type st: `Structure` :param st: Structure to search for matching substructures. :type subs_expression: str :param subs_expression: MacroModel-style substructure expression to use for the search. :type first_match_only: bool :param first_match_only: If True, return only the first match found. :rtype: list :return: List of atom indices match the substructure expression. """ return_list = [] # I'm compiling the pattern here because it makes it easier to catch # pattern errors. p = mm.mmsubs_new(subs_expression) matches = mm.mmsubs_match_comp(p, st, first_match_only) for match in matches: return_list.append(mmlist._mmlist_to_pylist(match)) mm.mmlist_delete(match) mm.mmsubs_delete(p) return return_list
[docs]def generate_asl(st, atom_list): """ Generate and return an atom expression for the atoms in `Structure` `st` which are listed in `atom_list`. The ASL expression will be as compact as possible using mol, res and atom expressions where appropriate. :type st: `Structure` :param st: Structure holding the ASL atoms. :type atom_list: list :param atom_list: List of indices of atoms for which ASL is desired. :rtype: str :return: ASL compactly describing the atoms in atom_list. """ # Start by creating a bitset from the atom list bs = Bitset.from_list(st.atom_total, atom_list) return mm.mmasl_generate_asl(bs, int(st))
[docs]def generate_residue_asl(residues): """ Create an ASL representing the residues. Inscode will only be included if at least one of the residues has a non-blank inscode. :param residues: Residue objects to create ASL :type residues: collections.abc.Iterable(structure._Residue) :rtype: str """ # Note: # ASLs generated by this function are faster to evaluate than # `" OR ".join(res.getAsl() for res in residues)` # by ~30% on average and an order of magnitude with no inscodes rescodes_by_chain = collections.defaultdict(set) found_inscode = False for res in residues: inscode = res.inscode chain = res.chain if not chain.strip(): chain = f'"{chain}"' if inscode.strip(): found_inscode = True else: inscode = f'"{inscode}"' rescodes_by_chain[chain].add((res.resnum, inscode)) if found_inscode: def create_asl_for_chain(rescodes): per_res_asls = ( f'(res.num {num} AND res.i {ins})' for num, ins in rescodes) residues_str = " OR ".join(per_res_asls) return f'({residues_str})' else: def create_asl_for_chain(rescodes): number_str = ",".join(str(resnum) for resnum, _ in rescodes) return f'res.num {number_str}' chain_parts = { chain: create_asl_for_chain(sorted(rescodes)) for chain, rescodes in rescodes_by_chain.items() } asl = " OR ".join(f'(chain. {chain} AND {residues})' for chain, residues in sorted(chain_parts.items())) if not found_inscode: asl = f'({asl}) AND res.i " "' return asl
[docs]def validate_asl(asl): """ Validate the given ASL expression. This is useful for validating an ASL when a structure object is not available - for example when validating a command line option. NOTE: A warning is also printed to stdout if the ASL is not valid. :param asl: ASL expression. :type asl: str :return: True if ASL is valid, False otherwise. :rtype: bool """ try: bs = Bitset(size=0) st = create_new_structure() mm.mmasl_parse_input(asl, bs, st, 0) return True except mm.MmException: return False
[docs]def evaluate_asl(st, asl_expr): """ Search for substructures matching the ASL (Atom Specification Language) string `asl_expr` in `Structure` `st`. :type st: `Structure` :param st: Structure to search for matching substructures. :type asl_expr: str :param asl_expr: ASL search string. :rtype: list :return: List containing indices of matching atoms. :raise schrodinger.infra.mm.MmException: If the ASL expression is invalid. """ # Get the number of atoms in this CT num_atoms = mm.mmct_ct_get_atom_total(st) # Create a bitset object of that size: bs = Bitset(size=num_atoms) # Parse the input string using mmasl: mm.mmasl_parse_input(asl_expr, bs, st, 1) return list(bs)
[docs]def get_atoms_from_asl(st, asl_expr): """ Return atoms matching the ASL string `asl_expr` in `Structure` `st`. :type st: `Structure` :param st: Structure to search for matching substructures. :type asl_expr: str :param asl_expr: ASL search string. :rtype: generator :return: Generator of matching `StructureAtom` objects. :raise schrodinger.infra.mm.MmException: If the ASL expression is invalid. """ return (st.atom[index] for index in evaluate_asl(st, asl_expr))
[docs]def hydrogens_present(st): """ Return True if all hydrogens are present in `Structure` `st`, False otherwise. Since all modern force fields require hydrogens, this is a good check to make sure that a structure is ready for force field calculations. This function is implemented by checking to see if the structure can be used as-is in a calculation with OPLS2003. :warning: Requires atom types to be correct. Consider calling {Structure.retype} first. :type st: `Structure` :param st: Structure to be tested. :rtype: bool :return: Are all hydrogens are present? """ # What the mmhtreat function does is check whether the structure will # run with the specified forcefield. Calling the function with OPLS2003 # (which requires all atoms have hydrogens) is effectively a check that # all atoms have hydrogens. Since all modern force fields require # hydrogens, it is a reasonable check that a structure is ready for any # backend job. (ok, bs) = mm.mmhtreat_checkff("OPLS2003", st) if mm.mmbs_in_use(bs): mm.mmbs_delete(bs) return bool(ok)
[docs]def has_valid_lewis_structure(st: Structure) -> bool: """ Check whether a valid Lewis structure for the structure is possible. Possible causes of an invalid Lewis structure may be invalid bond orders, charges, or missing hydrogens or other atoms. This check may be useful before attempting to run any backend calculations on the given structure. """ mm.mmerr_initialize() mm.mmlewis_initialize(mm.MMERR_OFF) try: # with MMERR_OFF we may still get output to the terminal, so silence I/O with ioredirect.IOSilence(): mm.mmlewis_apply(st) return True except mm.MmException: valid = False finally: mm.mmlewis_terminate() mm.mmerr_terminate() return valid
[docs]def generate_tautomer_code(st, considerEZStereo=True, considerRSStereo=True, stereo=STEREO_FROM_ANNOTATION_AND_GEOM, strip=False): lazy_canvas_import() mol = None if isinstance(st, _canvas.ChmMol): mol = st else: mol = create_chmmol_from_structure(st, stereo=stereo) if strip: mol = _canvas.ChmPathTools.strip(mol) tautomerCode = mol.getTautomerCode64(considerEZStereo, considerRSStereo) mol = None return str(tautomerCode)
[docs]def create_chmmol_from_structure(structure, stereo=STEREO_FROM_ANNOTATION_AND_GEOM): """ Creates a ChmMol object for a given structure and returns the ChmMol. :param schrodinger.structure.Structure structure: The structure to create ChmMol from. :param enum stereo: Specify how to determine the stereochemistry of a ChmMol from a Structure. Can be STEREO_FROM_GEOMETRY, STEREO_FROM_ANNOTATION, STEREO_FROM_ANNOTATION_AND_GEOM, or NO_STEREO. See `schrodinger.structutils.smiles.SmilesGenerator.__init__` for descriptions of these options. :return schrodinger.application.canvas.base.ChmMol: The created ChmMol. """ lazy_canvas_import() adaptor = _canvas.ChmMmctAdaptor() mol = adaptor.create(structure.handle, smiles._translate_stereo_enum(stereo)) return mol
[docs]def generate_smiles(st, unique=True, stereo=STEREO_FROM_ANNOTATION_AND_GEOM): """:deprecated: Use `schrodinger.adapter.to_smiles` instead.""" global smiles if smiles is None: import schrodinger.structutils.smiles as smiles smiles_generator = smiles.SmilesGenerator(stereo, unique=unique) pattern = smiles_generator.getSmiles(st) return pattern
[docs]def generate_smarts(st, atom_subset=None, check_connectivity=True): """:deprecated: Use `schrodinger.adapter.to_smarts` instead.""" msg = ("analyze.generate_smarts() is deprecated. Use " "schrodinger.adapter.to_smarts() instead.") warnings.warn(msg, DeprecationWarning, stacklevel=2) bs = Bitset(size=len(st.atom)) if atom_subset: for at in atom_subset: bs.set(at) else: bs.fill() # Check for connectivity (EV:106255) if check_connectivity: if atom_subset: new_ct = mm.mmct_ct_extract_atoms(st.handle, bs) new_st = Structure(new_ct) fragments = new_st.mol_total else: fragments = st.mol_total if not fragments: raise ValueError('No atoms found for SMARTS generation.') elif fragments > 1: raise ValueError('SMARTS generation only works on connected sets ' 'of atoms.') mmsmiles_handle = mm.mmsmiles_new() pattern = '' pattern = mm.mmsmiles_get_smarts(mmsmiles_handle, st.handle, bs) mm.mmsmiles_delete(mmsmiles_handle) return pattern
[docs]def generate_smarts_canvas(st, atom_subset=None, check_connectivity=True, include_hydrogens=False, honor_maestro_prefs=False, include_stereo=False): """:deprecated: Use `schrodinger.adapter.to_smarts` instead.""" lazy_canvas_import() maestro = get_maestro() # Check for connectivity (EV:106255) if check_connectivity: if atom_subset is not None: fragments = st.extract(atom_subset).mol_total else: fragments = st.mol_total if not fragments: raise ValueError('No atoms found for SMARTS generation.') elif fragments > 1: raise ValueError('SMARTS generation only works on connected sets ' 'of atoms.') if include_hydrogens: wantH = _canvas.optionSMARTS.H_Always else: wantH = _canvas.optionSMARTS.H_Default # Prepare bitset of atoms to generate SMARTS pattern for: bs = Bitset(size=st.atom_total) if atom_subset is not None: for at in atom_subset: bs.set(at) if st.atom[at].atomic_number == 1: wantH = 1 else: pos = 1 for atom in st.atom: if include_hydrogens or atom.atomic_number != 1: bs.set(pos) pos = pos + 1 # Generate smarts pattern. smart_generator = _canvas.ChmMmctSmilesGenerator() markRings = False wantMapping = False useAtomicNumberOnAcyclicAtoms = False if honor_maestro_prefs: if not maestro: raise ValueError("honor_maestro_prefs option can be used only when " "running inside Maestro") def _get_bool_pref(option): return (maestro.get_command_option("prefer", option) == "True") wantCharge = _get_bool_pref("smartscharge") wantSubstituents = _get_bool_pref("smartssubstituents") include_stereo = _get_bool_pref("smartsstereochemistry") else: wantCharge = True wantSubstituents = False if wantSubstituents: wantX = _canvas.optionSMARTS.X_Always else: wantX = _canvas.optionSMARTS.X_No return smart_generator.getSubsetSMARTS(st.handle, bs, markRings, wantH, wantX, include_stereo, wantMapping, None, useAtomicNumberOnAcyclicAtoms, wantCharge)
[docs]def can_atom_hydrogen_bond(atom): """ Returns True if the given atom can be involved in a hydrogen bond. :param atom: Atom in question :type atom: `structure._StructureAtom` :return: Whether atom can H-bond :rtype: bool """ if atom.element == 'H': if len(atom.bond) != 1: return False neighbor = next(atom.bonded_atoms) return bool(mm.mmct_hbond_is_donor(atom.structure, neighbor)) else: return bool(mm.mmct_hbond_is_acceptor(atom.structure, atom))
[docs]def generate_crystal_mates(st, radius=10.0, space_group=None, a=None, b=None, c=None, alpha=None, beta=None, gamma=None, group_radius=14.0): """ Generate crystal mates for the input `Structure` `st`. Return a list of structures that represent the crystal mates. (Note that the first item in the list represents the identity transformation and as such will be identical to the input structure.) All crystal mates within `radius` of the input structure are generated. The crystal parameters can be specified as parameters to this function or can be standard PDB properties of the input structure. If the structure was read from a PDB file then these crystal properties will usually be present. The group_radius is used in the crystal mates calculation to determine whether a symmetric element is in contact with the ASU. There should be little reason to change the default value of 14.0. :type st: `Structure` :param st: Structure for which crystal mates will be generated. :type radius: float :param radius: Distance within which to generate crystal mates. :type space_group: str :param space_group: Space group of the crystal. If `None`, uses `st`'s s_pdb_PDB_CRYST1_Space_Group. :type a: float :param a: Crystal 'a' length. If `None`, uses `st`'s s_pdb_PDB_CRYST1_a. :type b: float :param b: Crystal 'b' length. If `None`, uses `st`'s s_pdb_PDB_CRYST1_b. :type c: float :param c: Crystal 'c' length. If `None`, uses `st`'s s_pdb_PDB_CRYST1_c. :type alpha: float :param alpha: Crystal 'alpha' angle. If `None`, uses `st`'s s_pdb_PDB_CRYST1_alpha. :type beta: float :param beta: Crystal 'beta' angle. If `None`, uses `st`'s s_pdb_PDB_CRYST1_beta. :type gamma: float :param gamma: Crystal 'gamma' angle. If `None`, uses `st`'s s_pdb_PDB_CRYST1_gamma. :type group_radius: float :param group_radius: Used to determine whether a symmetric element is in contact with the ASU. There should be little reason to change the default value of 14.0. :rtype: list :return: list of `Structure objects<Structure>` that represent the crystal mates. (Note that the first item in the list represents the identity transformation and as such will be identical to the input structure.) """ handle = -1 absent = "" try: if not space_group: absent = "Space Group" space_group = st.property['s_pdb_PDB_CRYST1_Space_Group'] if not a: absent = "a" a = st.property['r_pdb_PDB_CRYST1_a'] if not b: absent = "b" b = st.property['r_pdb_PDB_CRYST1_b'] if not c: absent = "c" c = st.property['r_pdb_PDB_CRYST1_c'] if not alpha: absent = "alpha" alpha = st.property['r_pdb_PDB_CRYST1_alpha'] if not beta: absent = "beta" beta = st.property['r_pdb_PDB_CRYST1_beta'] if not gamma: absent = "gamma" gamma = st.property['r_pdb_PDB_CRYST1_gamma'] except: raise Exception("Some crystal properties are not specified: %s" % absent) handle = mm.mmcrystal_new() mm.mmcrystal_set_symmetry(handle, space_group, a, b, c, alpha, beta, gamma) mm.mmcrystal_set_group_radius(handle, group_radius) mates = mm.mmcrystal_generate_mates(handle, st, radius) ret_list = [] for m in mates: ret_list.append(Structure(m)) mm.mmcrystal_delete(handle) return ret_list
[docs]def find_overlapping_atoms(st, ignore_hydrogens=False, ignore_waters=False, dist_threshold=0.8): """ Search the specified structure for overlapping atoms. Returns a list of (atom1index, atom2index) tuples. :type st: `Structure` :param st: Structure to search for overlapping atoms :param ignore_hydrogens: Whether to ignore hydrogens. :param ignore_waters: Whether to ignore waters. :param dist_threshold: Atoms are considered overlapping if their centers are within this distance of each other. :rtype: list :return: Each value is a tuple containing the indices of overlapping atoms. """ asl = "all" if ignore_waters: asl += " AND (NOT water)" if ignore_hydrogens: asl += " AND (NOT atom.ele H)" measure = schrodinger.structutils.measure if asl == "all": close_atoms = measure.get_close_atoms(st, dist_threshold) else: matched_atoms = evaluate_asl(st, asl) close_atoms = measure.get_close_atoms(st, dist_threshold, matched_atoms) return close_atoms
[docs]def generate_molecular_formula(st): """ Return a string for the molecular formula in Hill notation for the `st`. The structure must contain only one molecule. :type st: `Structure` :param st: Find the molecular formula for this structure. Must contain only one molecule. :rtype: str :return: The molecular formula for `st`. """ def get_formula_substr(element_str, element_count): """ :type element_str: str :param element_str: The element name :type element_count: int :param element_count: Atom count for the element. :rtype: str :return: String for a single element with name and number. """ formula_substring = "" if element_count == 0: formula_substring = "" elif element_count == 1: formula_substring = element_str else: formula_substring = "".join([element_str, repr(element_count)]) return formula_substring # add atomic symbol counts to dict formula_dict = {} # key: element, value: number of atoms for atom in st.atom: element = atom.element try: formula_dict[element] += 1 except KeyError: formula_dict[element] = 1 formula_list = [] # Parse the dict to get formula, C and H are special if 'C' in formula_dict: formula_list.append(get_formula_substr('C', formula_dict['C'])) if 'H' in formula_dict: formula_list.append(get_formula_substr('H', formula_dict['H'])) # parse the rest of the dict keys to build the rest of the formula for element in sorted(list(formula_dict)): if element not in ['C', 'H']: formula_list.append( get_formula_substr(element, formula_dict[element])) return "".join(formula_list)
def _may_atom_be_part_of_rotatable_bond(atom, allow_methyl=False): """ Determine whether an atom might be in a rotatable bond. :type atom: `_StructureAtom` :param atom: Atom to test for presence of rotatable bond. :type allow_methyl: bool :param allow_methyl: allow -CH3 and -NH3 as rotatable bonds, if True :rtype: bool :return: Might the atom be in a rotatable bond? """ # Internal function # if atom is hydrogen, bond will not be rotatable if atom.atomic_number == 1: return False # If has one connection it is a terminal atom and non-rotatable if atom.bond_total <= 1: return False # Check if all attachments but one to atom are H atom_numH = 0 triple_present = False for bond in atom.bond: if bond.atom2.atomic_number == 1: atom_numH += 1 if bond.order == 3: triple_present = True break # Bond adjacent to triple bonds are not counted as rotatable if triple_present: return False # if have three H and atoma is a carbon or nitrogen, not rotatable if not allow_methyl and atom_numH == 3 and atom.atomic_number in [6, 7]: return False return True def _is_bond_ring_edge(bond, rings): """ Determine whether a bond is a ring edge. :type bond: `_StructureBond` :param bond: Bond to test for presence at ring edges. :type rings: list :param rings: list of ring atom index lists. :rtype: bool :return: Is the bond a ring edge? """ # Internal function. Returns True if specified bond is a ring edge. atoma = bond.atom1 atomb = bond.atom2 # Get rid of all ring bonds except those between atoms on different rings, i.e. C6H11-C6H11 # Figure out if atoma-atomb bond is a ring edge: is_ring_edge = False for ringatoms in rings: ring = _Ring(bond._ct, 0, ringatoms, None) for edge in ring.edge: if (edge.atom1 == atoma) and (edge.atom2 == atomb): is_ring_edge = True break elif (edge.atom1 == atomb) and (edge.atom2 == atoma): is_ring_edge = True break if is_ring_edge: break if is_ring_edge: return True return False
[docs]def is_bond_rotatable(bond, rings=None, allow_methyl=False): """ Return True if specified bond is rotatable, False otherwise. A bond is considered rotatable if all of the following are true... 1. It is a single bond. 2. It is not adjacent to a triple bond. 3. It is not in a ring. 4. Neither atom is a hydrogen or other terminal atom. 5. Neither atom is a carbon or nitrogen with three hydrogens attached. :type bond: `structure.StructureBond` :param bond: bond to test for rotatability. :type rings: list :param rings: List of ring atom index lists. As an optimization, provide the (sorted) rings list from the find_rings function if you already have it. Otherwise, an SSSR calculation will be done. :type allow_methyl: bool :param allow_methyl: allow -CH3 and -NH3 as rotatable bonds, if True :rtype: bool :return: Is the bond rotatable? """ # This function uses the logic derived from Impact's rotbond routine # Only 1-order bonds are rotatable: if bond.order != 1: return False if not _may_atom_be_part_of_rotatable_bond(bond.atom1, allow_methyl=allow_methyl): return False if not _may_atom_be_part_of_rotatable_bond(bond.atom2, allow_methyl=allow_methyl): return False # If got this far, then bond may be a rotatable bond (if not in ring) # If rings are not specified, determine the rings and sort by connection: if rings is None: rings = bond._ct.find_rings(sort=True) if _is_bond_ring_edge(bond, rings): return False return True
[docs]def rotatable_bonds_iterator(st, rings=None, max_size=None): """ Return an iterator for rotatable bonds (atomnum1, atomnum2) in the structure. See the `is_bond_rotatable` function description for which bonds are considered rotatable. :type st: `Structure` :param st: The structure to search for rotatable bonds. :type rings: list :param rings: List of ring atom index lists. As an optimization, provide the (sorted) rings list from the find_rings function if you already have it. Otherwise, an SSSR calculation will be done. :type max_size: int :param max_size: If specified, yield only rings that have up to this number of bonds. Use this option to exclude large rings; e.g. those in macrocycle-like molecules. :rtype: iterator of tuples :return: yields tuples of atom index pairs describing a rotatable bond in `st`. """ # This function uses logic derived from Impact's rotbond routine. # This is Python version of get_num_rotors from ligparse_utils.c # > must explicitly exit to not count rotors, each bond is # > considered rotatable as the default # If rings are not specified, determine the rings and sort by connection: if rings is None: rings = st.find_rings(sort=True) if max_size is not None: rings = [r for r in rings if len(r) <= max_size] # For each atom in the structure: for atoma in st.atom: if not _may_atom_be_part_of_rotatable_bond(atoma): continue # Now loop over all attachments to atoma: for bond in atoma.bond: atomb = bond.atom2 # only want to check 1/2 the bond matrix if int(atomb) < int(atoma): continue # if bond between atomba and atomb is double or triple bond, not rotatable if bond.order != 1: continue if not _may_atom_be_part_of_rotatable_bond(atomb): continue # If got this far, then atoma-atomb bond may be a rotatable bond (if not in ring) if _is_bond_ring_edge(bond, rings): continue # not rotatable # If control makes it this far, it's a rotatable bond: yield (atoma.index, atomb.index)
# Will raise StopIteration when gets here.
[docs]def get_num_rotatable_bonds(st, rings=None, max_size=None): """ Return the number of rotatable bonds in the Structure `st`. The count does not include trivial rotors such as terminal methyls, or rotors within rings. :type st: `Structure` :param st: The structure to search for rotatable bonds. :type rings: list of lists of ints :param rings: List of ring atom index lists. As an optimization, provide the (sorted) rings list from the find_rings function if you already have it. Otherwise, an SSSR calculation will be done. :type max_size: int :param max_size: If specified, yield only rings that have up to this number of bonds. Use this option to exclude large rings; e.g. those in macrocycle-like molecules. :rtype: int :return: The number of rotatable bonds in `st`. """ # Comments moved down from the docstring during the docstring review. # This function returns the number of rotatable bonds in the st # using logic derived from Impact's rotbond routine # This is Python version of get_num_rotors from ligparse_utils.c # > must explicitly exit to not count rotors, each bond is # > considered rotatable as the default return sum(1 for item in rotatable_bonds_iterator( st, rings=rings, max_size=max_size))
[docs]def hbond_iterator(st, atoms=None): """ Iterate over hydrogen bond between the atoms specified by the `atom_set` and the other atoms in `st`. Each yielded item is a tuple of (atom-index-1, atom-index-2). NOTE: This function has been updated to simply act as a wrapper to `hbond.get_hydrogen_bonds` to ensure that hbonds are determined consistently. :type st: `Structure` :param st: The structure to search for H-bonds. :type atoms: list of int or None :param atoms: A list of atom indices (or _StructureAtom objects) to analyze. If not specified, then all H-bonds present in the structure are returned. :rtype: list of (`_StructureAtom`, `_StructureAtom`) :return: list of (donor atom object, acceptor atom object) for each hydrogen bond identified. """ msg = ("analyze.hbond_iterator() is deprecated. Use " "schrodinger.structutils.interactions.hbond.get_" "hydrogen_bonds() instead.") warnings.warn(msg, DeprecationWarning, stacklevel=2) return hbond.get_hydrogen_bonds(st, atoms1=atoms)
[docs]def find_equivalent_atoms(st, span_molecules=True): """ Find atoms in the structure that are equivalent. For example, all three hydrogens on a methyl group are equivalent. Returns a list, each value of which is a list of atoms that are equivalent. :type st: `Structure` :param st: The structure to search for equivalent atoms. :type span_molecules: bool :param span_molecules: If True, don't consider molecules to be separate entities. If False, constructs global equivalence classes for all atoms in a ct, but will never return an equivalence class across molecules. :rtype: list :return: Each value is a list of indices of equivalent atoms. """ sites = mm.mmsym_find_equivalent_sites_mol(st, not span_molecules) atoms_by_site = {} for i, sitenum in enumerate(sites): try: atoms_by_site[sitenum].append(i + 1) except KeyError: atoms_by_site[sitenum] = [i + 1] # Remove atoms that have no equivalent to them: for key, value in list(atoms_by_site.items()): if len(value) == 1: del atoms_by_site[key] return list(atoms_by_site.values())
[docs]def get_approximate_sasa(st, atom_indexes=None, cutoff=7.0): """ :deprecated: This function only returns a rough approximation to the solvent accessible surface area. Please use the `calculate_sasa` function instead. """ msg = "analyze.get_approximate_sasa() is deprecated. Use calculate_sasa() instead." warnings.warn(msg, DeprecationWarning, stacklevel=2) if atom_indexes is None: atom_indexes = list(range(1, st.atom_total + 1)) all_A_c = [] for iat in atom_indexes: A_c = get_approximate_atomic_sasa(st, iat, cutoff) all_A_c.append(A_c) structure_sasa = sum(all_A_c) return structure_sasa
[docs]def get_approximate_atomic_sasa(st, iat, cutoff=7.0, sasa_probe_radius=1.4, hard_sphere_s=2.5, scale_factor=2.32): """ :deprecated: Deprecated in favor of `calculate_sasa`, which is more accurate. """ msg = "analyze.get_approximate_atomic_sasa() is deprecated. Use calculate_sasa() instead." warnings.warn(msg, DeprecationWarning, stacklevel=2) # FIXME: The scale_factor should not be required (e.g. = 1), and # represents a flaw in my implementation. However, empirically I find # it gives OK results for both sucrose confs and lysozyme 1l47.pdb. # The scale_factor makes S larger when evaluating individual # probabilities. Without the factor, the product many near unity # probabilties (e.g. 0.98), result in a A' surface area that is # too small. r1_rad = st.atom[iat].radius S = 4 * math.pi * math.pow((r1_rad + sasa_probe_radius), 2) # atomic sasa all_prob = [] all_b_prime = [] # Loop over non-identity atoms, calculate b and b'. asl_expr = "not a.n %d and within %f a.n %d" % (iat, cutoff, iat) for jat in evaluate_asl(st, asl_expr): r2_rad = st.atom[jat].radius r1_r2_rad = r1_rad + r2_rad dist = st.measure(iat, jat) # b: maximal overlap Eq 3, the amount of iat's SA buried by jat. # if dist > (r1_r2_rad + 2*sasa_probe_radius) # then b should be zero. b = None if dist > (r1_r2_rad + 2 * sasa_probe_radius): b = 0 else: b = (math.pi * (r1_rad + sasa_probe_radius) * (r1_r2_rad + 2 * sasa_probe_radius - dist) * (1 + ((r2_rad - r1_rad) / dist))) # A negative b is non-physical so convert to 0.0. if b < 0: b = 0.0 # b': minimal overlap Eq 5, accounts for excluded volume counts. b_prime = (math.pi * (r1_rad + sasa_probe_radius) * (r1_r2_rad + 2 * sasa_probe_radius - hard_sphere_s - dist) * (1 + ((r2_rad - hard_sphere_s - r1_rad) / dist))) # A negative b_prime is non-physical so convert 0.0. if b_prime < 0: b_prime = 0.0 all_b_prime.append(b_prime) prob_j = 1 - ((b - b_prime) / (S * scale_factor)) all_prob.append(prob_j) # Loop over probabilities. product = 1 for prob_j in all_prob: product *= prob_j B_prime = sum(all_b_prime) # Eq 9. A_prime = S * product # Eq 7. # Eq 6. if A_prime < B_prime: A_c = 0 else: A_c = A_prime - B_prime logger.debug("%d(%s) %.2f, S %.2f, A' %.2f, B' %.2f, p(%d) %.5f" % (iat, st.atom[iat].element, A_c, S, A_prime, B_prime, len(all_prob), product)) return A_c
[docs]def calculate_sasa_by_atom(st, atoms=None, cutoff=8.0, probe_radius=1.4, resolution=0.2, exclude_water=False): """ Calculate the solvent-accessible surface area (SASA) for the whole structure, or an atom subset, and returns a list of floats. :type st: `Structure` :param st: Structure for which SASA is desired. :type atoms: list :param atoms: List of atom indices or of `_StructureAtom objects<_StructureAtom>` for the atoms to count. If None, calculates SASA for all atoms. (default: None) :type cutoff: float :param cutoff: Atoms within this distance of `atoms` will be considered for occlusion. Requires `atoms` to be specified. (default: 8.0A) :type probe_radius: float :param probe_radius: Probe radius, in units of angstroms, of the solvent molecule. (default: 1.4A) :type resolution: float :param resolution: Resolution to use. Decreasing this number will yield better results, increasing it will speed up the calculation. NOTE: This is NOT the same option as Maestro's surface resolution, which uses a different algorithm to calculate the surface area. (default: 0.2) :type exclude_water: bool :param exclude_water: If set to True then explicitly exclude waters in the method. This option is only works when 'atoms' argument is passed. (default: False) :rtype: list :return: A list of solvent accessible surface area of selected atoms in `st`. """ # NOTE: the calculateSASA() function is defined here: # mmshare/canvaslibs_ext/mm/ChmMmctAdaptor.cpp # # The Phase library that is used by the above function is defined here: # mmshare/phaselibs/feature/PhpSASArea.cpp # lazy_canvas_import() if atoms is None and not exclude_water: # Calculate SASA for the whole structure sasa_list = _canvas.ChmMmctAdaptor.calculateSASAByAtom( st.handle, probe_radius, resolution, False) else: # Calculate SASA for a subset of structure or exclude waters if atoms is None: atoms = st.getAtomIndices() atoms_asl = "all" else: # In case the user passed an atom iterator: atoms = list(map(int, atoms)) # If there are no atoms, return an empty list if not atoms: return [] atoms_asl = generate_asl(st, atoms) # Calculate SASA for a subset # Create a bitset of the atom subset: atoms_bitset = Bitset.from_list(st.atom_total, atoms) # Create a bitset of the occlusion set: close_asl = "(within %s %s)" % (cutoff, atoms_asl) if exclude_water: close_asl += " and not water" close_atoms = evaluate_asl(st, close_asl) consider_atoms_bitset = Bitset.from_list(st.atom_total, close_atoms) sasa_list = _canvas.ChmMmctAdaptor.calculateSASAByAtom( st.handle, atoms_bitset.handle, consider_atoms_bitset.handle, probe_radius, resolution, False) return sasa_list
[docs]def calculate_sasa_by_residue(st, atoms=None, cutoff=8.0, probe_radius=1.4, resolution=0.2, exclude_water=False): """ Calculate the solvent-accessible surface area (SASA) for the whole structure, or an atom subset, and then group them by residue. :type st: `Structure` :param st: Structure for which SASA is desired. :type atoms: list :param atoms: List of atom indices or of `_StructureAtom objects<_StructureAtom>` for the atoms to count. If None, calculates SASA for all atoms. (default: None) :type cutoff: float :param cutoff: Atoms within this distance of `atoms` will be considered for occlusion. Requires `atoms` to be specified. (default: 8.0A) :type probe_radius: float :param probe_radius: Probe radius, in units of angstroms, of the solvent molecule. (default: 1.4A) :type resolution: float :param resolution: Resolution to use. Decreasing this number will yield better results, increasing it will speed up the calculation. NOTE: This is NOT the same option as Maestro's surface resolution, which uses a different algorithm to calculate the surface area. (default: 0.2) :type exclude_water: bool :param exclude_water: If set to True then explicitly exclude waters in the method. This option is only works when 'atoms' argument is passed. (default: False) :rtype: list :return: a list of solvent accessible surface area of residues (ordered by connectivity) within `st`. """ sasa_list = calculate_sasa_by_atom(st, atoms=atoms, cutoff=cutoff, probe_radius=probe_radius, resolution=resolution, exclude_water=exclude_water) if atoms is None: atoms = st.getAtomIndices() else: atoms = list(map(int, atoms)) # now do all the accounting sasa_by_residue = [] for res in get_residues_by_connectivity(st): sasa = 0.0 res_matched = False for atom in res.atom: if atom.index in atoms: sasa += sasa_list[atom.index - 1] res_matched = True if res_matched: sasa_by_residue.append(sasa) return sasa_by_residue
[docs]def calculate_sasa(st, atoms=None, cutoff=8.0, probe_radius=1.4, resolution=0.2, exclude_water=False, exclude_atoms=None): """ Calculate the solvent-accessible surface area (SASA) for the whole structure, or an atom subset. :type st: `Structure` :param st: Structure for which SASA is desired. :type atoms: list :param atoms: List of atom indices or of `_StructureAtom objects<_StructureAtom>` for the atoms to count. If None, calculates SASA for all atoms. (default: None) :type cutoff: float :param cutoff: Atoms within this distance of `atoms` will be considered for occlusion. Requires `atoms` to be specified. (default: 8.0A) :type probe_radius: float :param probe_radius: Probe radius, in units of angstroms, of the solvent molecule. (default: 1.4A) :type resolution: float :param resolution: Resolution to use. Decreasing this number will yield better results, increasing it will speed up the calculation. NOTE: This is NOT the same option as Maestro's surface resolution, which uses a different algorithm to calculate the surface area. (default: 0.2) :type exclude_water: bool :param exclude_water: If set to True then explicitly exclude waters in the method. This option is only works when 'atoms' argument is passed. (default: False) :type exclude_atoms: list :param exclude_atoms: aid of atoms that you don't want in SASA caluclation useful for FEP-type systems where a second 'image' of a molecules is present :rtype: float :return: The solvent accessible surface area of the selected `atoms` within `st`. """ # NOTE: the calculateSASA() function is defined here: # mmshare/canvaslibs_ext/mm/ChmMmctAdaptor.cpp # # The Phase library that is used by the above function is defined here: # mmshare/phaselibs/feature/PhpSASArea.cpp # lazy_canvas_import() if atoms is None: # Calculate SASA for the whole structure sasa = _canvas.ChmMmctAdaptor.calculateSASA( st.handle, probe_radius, resolution, False, # excludeH=False - For consistency with Maestro ) else: # In case the user passed an atom iterator: atoms = list(map(int, atoms)) # Calculate SASA for a subset # Create a bitset of the atom subset: atoms_bitset = Bitset.from_list(st.atom_total, atoms) #TODO: need to explicitly exclude the other molecule if FEP system # Create a bitset of the occlusion set: atoms_asl = "(atom.num %s)" % ', '.join(map(str, atoms)) close_asl = "(within %s %s)" % (str(cutoff), atoms_asl) if exclude_water: close_asl += " and not water" if exclude_atoms: close_asl += ' and not (atom.num %s)' % \ ', '.join(map(str, exclude_atoms)) close_atoms = evaluate_asl(st, close_asl) if len(close_atoms) == len(atoms): newst = st.extract(atoms) sasa = _canvas.ChmMmctAdaptor.calculateSASA(newst.handle, probe_radius, resolution, False) else: consider_atoms_bitset = Bitset.from_list(st.atom_total, close_atoms) sasa = _canvas.ChmMmctAdaptor.calculateSASA( st.handle, atoms_bitset. handle, # const int& mmbsSubsetAtomsCount, //these are considered in sum consider_atoms_bitset. handle, # const int& mmbsSubsetAtomsConsider, //these are considered for occlusion probe_radius, resolution, False) if math.isnan(sasa): sasa = 0.0 return sasa
[docs]def calc_buried_sasa_by_residue(st, group1_atoms, group2_atoms, resolution=0.5): """ Calculate the buried SASA ratio (delta SASA upon binding) for each residue, which measures how much of the residue's surface is interacting with the other binding partner. Value of 1.0 means that all of residue's area, as calculated in a subunit, is no longer accessible to solvent when in complex with the other group. Value of 0.0 means that all of that residue is accessible as a complex as well (residue is not on the interaction surface). Value of 0.0 is also used for residues that are fully buried in their subunits. :param st: Structure object :type st: `structure.Structure` :param group1_atoms: Atoms of the first binding partners group. :type group1_atoms: list of ints :param group2_atoms: Atoms of the second binding partners group. :type group2_atoms: list of ints :param resolution: Resolution to use. See calculate_sasa_by_atom(). :type resolution: float :return: Dictionary where keys are residue strings (e.g. "A:123"), and values are the buried SASA ratio for that residue (0.0-1.0). :rtype: dict """ # Initialize the output dict: buried_sasa_by_res = {str(res): 0.0 for res in st.residue} def calc_unbound_sasa(st, group_atoms): del_atoms = [ ai for ai in range(1, st.atom_total) if ai not in group_atoms ] group_st = st.copy() renumber_map = group_st.deleteAtoms(del_atoms, renumber_map=True) inv_renumber_map = {new: orig for orig, new in renumber_map.items()} sasa_by_atom = calculate_sasa_by_atom(group_st, resolution=resolution) sasa_by_orig_atom = {} for new_anum, sasa in enumerate(sasa_by_atom, start=1): orig_anum = inv_renumber_map[new_anum] sasa_by_orig_atom[orig_anum] = sasa return sasa_by_orig_atom # Calculate the SASA in context of the whole structure/complex: all_sasa_by_atom = calculate_sasa_by_atom(st, resolution=resolution) # Calculate SASA for individual groups: group1_sasas = calc_unbound_sasa(st, group1_atoms) group2_sasas = calc_unbound_sasa(st, group2_atoms) unbound_sasa_by_atom = { anum: sasa for anum, sasa in list(group1_sasas.items()) + list(group2_sasas.items()) } for res in st.residue: bound_sasa = 0.0 unbound_sasa = 0.0 for atom in res.atom: if atom.index in group1_atoms or atom.index in group2_atoms: bound_sasa += all_sasa_by_atom[atom.index - 1] # 0-indexed unbound_sasa += unbound_sasa_by_atom[atom.index] # 1-indexed if unbound_sasa == 0.0: # completely buried, can't interact with the other group ratio = 0.0 else: ratio = 1.0 - (bound_sasa / unbound_sasa) buried_sasa_by_res[str(res)] = ratio return buried_sasa_by_res
[docs]def find_ligands(st, **kwargs): """ Simple function interface for `AslLigandSearcher` class. :type st: `Structure` :param st: Structure to search. :rtype: list :return: a list of `Ligand` instances for putative ligands within st. """ asl_searcher = AslLigandSearcher(**kwargs) return asl_searcher.search(st)
[docs]def center_of_mass(st, atom_indices: list = None): """ Gets the structure's center of mass. If specified, this can be limited to a subset of atoms. NOTE: Periodic boundary conditions (PBC) are NOT honored. :param st: structure :type st: structure.Structure :param atom_list: 1-based atom indices :type atom_list: list(int) :return: centroid given as 3-element array [x, y, z] :rtype: numpy.array(float) See schrodinger/geometry/centroid.h """ if atom_indices: bs = mmbitset.Bitset.from_list(st.atom_total, atom_indices) xyz = geometry.center_of_mass(st, bs) else: xyz = geometry.center_of_mass(st) # hack to fix wrapping and return [x, y, z] return xyz[:, 0]
[docs]def radius_of_gyration( st: "structure.Structure", # noqa: F821 atom_indices: Optional[List[int]] = None, mass_weighted: bool = False) -> float: """ Calculate radius of gyration (R_gyr or Rg) is a measure of the size of an object of arbitrary shape. NOTE: Periodic boundary conditions (PBC) are NOT honored. :return: float value in Angstrom units """ atom_indices = atom_indices or list(range(1, st.atom_total + 1)) # Calculate the centroid com = numpy.array(center_of_mass(st, atom_indices=atom_indices)) # Calculate the radius of gyration. R^2 = (1/N)*sum((r_i - r_mean)^2) xyz = numpy.array([st.atom[i].xyz for i in atom_indices]) r2 = numpy.square(xyz - com).sum(axis=1) weights = [st.atom[i].atomic_weight for i in atom_indices] \ if mass_weighted else [1] * len(atom_indices) # Calculate the sq distance between the atom and the centroid. r_gyr = numpy.sqrt(numpy.average(r2, weights=weights)) return r_gyr
[docs]def calculate_principal_moments(struct=None, atoms=None, massless=False): """ Calculate the principal moments of inertia for a list of atoms. This is calculated with respect to the x, y, and z coordinates of the atom's center of mass. :type struct: `Structure` :param struct: If given the moments will be calculated for the entire structure. This overrides any atoms given with the atoms keyword. Either atoms or structure must be given. :type atoms: list :param atoms: list of `schrodinger.structure._StructureAtom` objects. Atom objects to compute the tensor for. Either atoms or structure must be given. :type massless: bool :param massless: True if the calculations should be independent of the atomic masses (all mass=1), False (default) if atomic mass should be used. :rtype: tuple :return: A tuple of (eigenvalues, eigenvectors) of the inertial tensor. The eigenvalues are the principle moments of inertia and are a list of length 3 floats. The eigenvectors are a list of lists, each inner list is a list of length 3 floats. """ def get_distance(pt1, pt2): """ Get the distance between two "points". The points need to be numpy arrays with 3 floats in each. :type pt1: 3 element `numpy array<numpy.ndarray>` :param pt1: first point :type pt2: 3 element `numpy array<numpy.ndarray>` :param pt2: second point :rtype: float :return: The distance between `pt1` and `pt2` """ c = pt2 - pt1 return numpy.sqrt(numpy.sum(c * c)) if struct: atoms = [x for x in struct.atom] total_atoms = len(atoms) # First we calculate the center of mass. We do not use # a separate function for this since we do not want to # grab the atom properties more than once since this # significantly slows the process down atom_mass = numpy.zeros(total_atoms) atom_xyz = numpy.zeros((total_atoms, 3)) for i in range(0, total_atoms): if massless: atom_mass[i] = 1 else: atom_mass[i] = atoms[i].atomic_weight atom_xyz[i] = numpy.array(atoms[i].xyz) # Multiply each [x,y,z] by the mass of that atom mass_weighted = atom_xyz * atom_mass[:, numpy.newaxis] com = numpy.sum(mass_weighted, axis=0) / numpy.sum(atom_mass) shape = (3, 3) indices = numpy.ndindex(*shape) tensor = numpy.zeros(shape) for i, j in indices: delta = int(i == j) elem = numpy.zeros(total_atoms) for idx in range(0, total_atoms): vec = atom_xyz[idx] - com r2d = (get_distance(atom_xyz[idx], com)**2) * delta elem[idx] = atom_mass[idx] * (r2d - vec[i] * vec[j]) tensor[i][j] = numpy.sum(elem) moments, vectors = numpy.linalg.eigh(tensor) moments = moments.tolist() vectors = vectors.tolist() axes = [] for axis in range(3): axes.append([x[axis] for x in vectors]) return moments, axes
[docs]def get_largest_moment_normalized_vector(**kwargs): """ Return the normalized eigenvector of the largest moment of inertia. This will be the vector normal to the plane of the largest moment. See calculate_principal_moments for parameters. :rtype: `numpy.array` :return: The normalized vector for the largest moment of inertia """ vals, vecs = calculate_principal_moments(**kwargs) eigvalvecs = list(zip(vals, vecs)) largest_moment, largest_vector = max(eigvalvecs) ivec = numpy.array(largest_vector) inertial_vec = transform.get_normalized_vector(ivec) return inertial_vec
[docs]def find_shortest_bond_path(struct, index1, index2, atom_ids=None): """ Find the shortest path of bonded atoms that connects atom1 to atom2 The conversion of this routine to use networkx rather than scipy resulted in a dramatic reduction in both time and memory usage. :type struct: `schrodinger.structure.Structure` :param struct: The structure containing the atoms index1 and index2 :type index1: int :param index1: The index of the first atom in the path :type index2: int :param index2: The index of the second atom in the path :type atom_ids: the atom_ids to search path from :param atom_ids: list of int :rtype: list :return: A list of indexes of atoms that connect atom index1 to atom index2 along the shortest bond path. Index1 will be the first item in the list and index2 will be the last. The second item in the list will be bonded to index1, the third will be bonded to the second, etc. If index1 == index2, a single item list is returned: [index1] :raise ValueError: if index1 and index2 are not part of the same molecule :raise MemoryError: if the system is too large """ if index2 == index1: return [index1] atom1 = struct.atom[index1] mol = atom1.getMolecule() if atom_ids is None: atom_ids = mol.getAtomIndices() if index2 not in atom_ids: # Manually raise a value error rather than go through the whole process # and have networkx raise a networkx.exception.NetworkXNoPath error raise ValueError('Both atoms must be part of the same molecule') nx_graph = networkx.Graph() for atom_id in atom_ids: atom = struct.atom[atom_id] for neighbor in atom.bonded_atoms: if neighbor.index > atom.index: nx_graph.add_edge(atom.index, neighbor.index) path = networkx.shortest_path(nx_graph, index1, index2) return path
[docs]def create_nx_graph(struct, atoms=None): """ Generate a networkx undirected graph of the structure based on bonds :type struct: `schrodinger.structure.Structure` :param struct: the structure to create a graph of :type atoms: iterable :param atoms: Optionally, graph edges will be restricted to this group of atoms - items are `schrodinger.structure._StructureAtom` objects or atom indexes :rtype: `networkx.Graph` :return: An undirected graph of the structure with edges in place of each bond. Edges are identical regardless of the bond order of the bond. """ if struct is None: raise RuntimeError('struct must be supplied to create a graph') nx_graph = networkx.Graph() if not atoms: for bond in struct.bond: nx_graph.add_edge(bond.atom1.index, bond.atom2.index) else: # Convert any indexes to atom objects and create a set atom_objs = set() for atom in atoms: atom_objs.add(struct.atom[int(atom)]) for atom in atom_objs: aindex = atom.index for neighbor in atom.bonded_atoms: if neighbor in atom_objs and neighbor.index > aindex: nx_graph.add_edge(aindex, neighbor.index) return nx_graph
[docs]def improper_dihedral_iterator(struct=None, atoms=None, nx_graph=None, include_proper_improper=True, include_proper=True): """ An iterator over all the improper dihedral angles in a structure or group of atoms. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find improper dihedrals in. Either struct or nx_graph must be given. :type atoms: iterable :param atoms: Optionally, improper dihedrals will be restricted to this group of atoms - items are `schrodinger.structure._StructureAtom` objects or atom indexes :type graph: `networkx.Graph` :param graph: A networkx graph of the structure with edges representing bonds. If not supplied one will be generated. If graph is supplied, struct and atoms are ignored. :type include_proper_improper: bool :param include_proper_improper: whether to include improper dihedrals that define the same degree-of-freedom defined by a proper dihedral obtained from torsion_iterator, for example in the digrams below the neighboring atoms are not bound and so the defined impropers offer new degrees-of-freedom but if they were bound (suppose (R'',12) was bound to (R',26) in the first diagram) the degree-of-freedom defined by the improper (7, 37, 12, 26) is redundant with that defined by the proper (7, 37, 12, 26) obtained from torsion_iterator, this boolean controls whether such impropers can be returned :type include_proper: bool :param include_proper: whether to include proper dihedrals obtained from torsion_iterator that define a new degree-of-freedom not defined by any improper dihedral, for example in the digram below suppose (R'',12) was bound to (R',26) and the bond between (X,37) and (R',26) did not exist, this boolean controls whether such propers can be returned as impropers :rtype: tuple :return: Each iteration yields a 4-integer tuple of atom indexes for an improper dihedral. For each given quadruple all unique topologies are enumerated. For example, for a standard quadruple ordering (i,j,k,l) all 6 of the following topologies are returned: (1) (i,j,k,l) (2) (j,i,k,l) # switch 1,2 (3) (i,j,l,k) # switch 3,4 (4) (j,i,l,k) # switch 1,2 and 3,4 (5) (k,j,i,l) # switch 1,3 (6) (i,l,k,j) # switch 2,4 The standard quadruple ordering (i,j,k,l) considers the indices of the first three atoms as the central atom plus the two lowest index bonding atoms in bond-path ordering as lowest bonding atom, central atom, other bonding atom. The last index in the quadruple is the remaining atom which is not bonded to the last atom in the previously mentioned triple but is bonded to the central atom. If calling with include_proper then those quadruples do not have their topologies enumerated and the ordering will be such that the index of the first atom in the tuple will be smaller than the index of the last atom in the tuple. For example:: (R,7) \ (X,37)-(R',26) / (R'',12) (1) (7, 37, 12, 26) (standard) (2) (37, 7, 12, 26) (3) (7, 37, 26, 12) (4) (37, 7, 26, 12) (5) (12, 37, 7, 26) (6) (7, 26, 12, 37) or:: (R,34) \ (X,1)-(R',4) / (R'',78) (1) (4, 1, 34, 78) (standard) etc. """ # for coarse-grain structures bonding patterns can be odd # relative to an atomistic viewpoint and so be general, # for N atoms bonded to a central atom the number of unique # improper dihedrals is the binomial coefficient N-choose-3 # or (N 3) = N!/((N-3)!3!), if not include_proper_improper then # skip cases where neighbors bonded to the same central atom are # also bonded to each other as those degrees-of-freedom are # covered with normal proper torsions (see torsion_iterator), # with the function name improper_dihedral_iterator running with # include_proper is technically confusing as such proper dihedrals # are not at all related to improper dihedrals but offered here # anyway due to reasons discussed in MATSCI-4891 if nx_graph is None: nx_graph = create_nx_graph(struct, atoms=atoms) get_topologies = lambda i, j, k, l: [(i, j, k, l), (j, i, k, l), (i, j, l, k), (j, i, l, k), (k, j, i, l), (i, l, k, j)] for ind2 in nx_graph.nodes(): neighbors = list(nx_graph.neighbors(ind2)) if len(neighbors) >= 3: for triple in itertools.combinations(neighbors, 3): redundant = False if not include_proper_improper: for pair in itertools.combinations(triple, 2): if nx_graph.has_edge(*pair): redundant = True break if not redundant: ind1, ind3, ind4 = sorted(triple) for quadruple in get_topologies(ind1, ind2, ind3, ind4): yield quadruple # nothing is done here to ensure that the quadruples yielded here # have not already been yielded as cases where the degree-of-freedom # could be described as either improper or proper, so it is the # callers responsibility to dedup if necessary if include_proper: for quadruple in torsion_iterator(nx_graph=nx_graph): yield quadruple
[docs]def torsion_iterator(struct=None, atoms=None, nx_graph=None): """ An iterator over all the bonded torsions in a structure or group of atoms. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find torsions in. Either struct or nx_graph must be given. :type atoms: iterable :param atoms: Optionally, torsions will be restricted to this group of atoms - items are `schrodinger.structure._StructureAtom` objects or atom indexes :type graph: `networkx.Graph` :param graph: A networkx graph of the structure with edges representing bonds. If not supplied one will be generated. If graph is supplied, struct and atoms are ignored. :rtype: tuple :return: Each iteration yields a 4-integer tuple of atom indexes for a dihedral formed by bonded atoms. The index of the first atom in the tuple will be smaller than the index of the last atom in the tuple. """ if nx_graph is None: nx_graph = create_nx_graph(struct, atoms=atoms) # In testing over molecules from 12 to 161,000 atoms, this networkx # implementation was consistently 3X faster than a similar ct-based # algorithm for ind2, ind3 in nx_graph.edges(): # nx_graph[x] returns all the nodes attached to x for ind1 in nx_graph[ind2]: if ind1 != ind3: for ind4 in nx_graph[ind3]: # 4 != 1 avoids three-membered rings if ind4 != ind2 and ind4 != ind1: if ind1 < ind4: yield (ind1, ind2, ind3, ind4) else: yield (ind4, ind3, ind2, ind1)
[docs]def angle_iterator(struct=None, atoms=None, nx_graph=None): """ An iterator over all the bonded angles in a structure or group of atoms. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find angles in. Either struct or nx_graph must be given. :type atoms: iterable :param atoms: Optionally, angles will be restricted to this group of atoms - items are `schrodinger.structure._StructureAtom` objects or atom indexes :type graph: `networkx.Graph` :param graph: A networkx graph of the structure with edges representing bonds. If not supplied one will be generated. If graph is supplied, struct and atoms are ignored. :rtype: tuple :return: Each iteration yields a 3-integer tuple of atom indexes for an angle formed by bonded atoms. The index of the first atom in the tuple will be smaller than the index of the last atom in the tuple. """ if nx_graph is None: nx_graph = create_nx_graph(struct, atoms=atoms) # See torsion_iterator for a comment on the speed of using networkx vs # ct-based algorithm for ind2 in nx_graph.nodes(): neighbors = nx_graph.neighbors(ind2) for ind1, ind3 in itertools.combinations(neighbors, 2): if ind1 < ind3: yield (ind1, ind2, ind3) else: yield (ind3, ind2, ind1)
[docs]def bond_iterator(struct=None, atoms=None, nx_graph=None): """ An iterator over all the bonds in a structure or group of atoms. Note: It may seem unnecessary to have this function as one must iterate over bonds to form the nx_graph that is then iterated over in this function. However, it may be the case that an nx_graph has already been created for other reasons such as when iterating over all bonds, angles and torsions. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find bonds in. Either struct or nx_graph must be given. :type atoms: iterable :param atoms: Optionally, bonds will be restricted to this group of atoms - items are `schrodinger.structure._StructureAtom` objects or atom indexes :type graph: `networkx.Graph` :param graph: A networkx graph of the structure with edges representing bonds. If not supplied one will be generated. If graph is supplied, struct and atoms are ignored. :rtype: tuple :return: Each iteration yields a 2-integer tuple of atom indexes for a bond formed by bonded atoms. The index of the first atom in the tuple will be smaller than the index of the last atom in the tuple. """ if nx_graph is None: nx_graph = create_nx_graph(struct, atoms=atoms) # See torsion_iterator for a comment on the speed of using networkx vs # ct-based algorithm for ind1, ind2 in nx_graph.edges(): if ind1 < ind2: yield (ind1, ind2) else: yield (ind2, ind1)
[docs]def get_average_structure(sts): """ Calculate the average structure between the given conformers. :type sts: Iterable of `structure.Structure` objects :param sts: Structures to average :rtype: `structure.Structure` :return: Average structure """ coord_sums = None average_st = None for i, st in enumerate(sts, start=1): st_coords = st.getXYZ() if i == 1: coord_sums = st_coords average_st = st.copy() else: if st.atom_total != average_st.atom_total: raise ValueError("Input structures must be conformers") coord_sums = numpy.sum((coord_sums, st_coords), axis=0) if coord_sums is None: raise ValueError("Input sts can not be empty") average_coords = numpy.divide(coord_sums, i) average_st.setXYZ(average_coords) return average_st
[docs]def find_common_substructure(sts, atomTyping=11, allow_broken_rings=True): """ Find the maximum substructure that is common between all specified CTs. If any of the structures matches the substructure SMARTS more than once, then all matches are reported - that is why output a "triple" list. Outer list represents input structure, next list represents matches, and inner list is list of atom indices for that match. It's up to the calling code to decide which of the multiple matches to use (one method is to use the one whose center-of-mass is closest to the COM of the whole ligand). NOTE: This function becomes exponentioally slow with larger number of structures. Recommened maximum around 30 structures. NOTE: This function checks CANVAS_SHARED exists and checks out CANVAS_FULL :type sts: Iterable of `structure.Structure` objects :param sts: Structures to average :type atomTyping: int :param atomTyping: Atom typing scheme to use. For list of available schemes, see $SCHRODINGER/utilities/canvasMCS -h :type allow_broken_rings: bool :param allow_broken_rings: Whether to allow partial mapping of rings :rtype: List of list of list of ints :return: Substructure atoms from each structure. Outer list represents input structures - in order of input; middle list represents matches, inner list represents atom indices for that match. """ settings = canvas.MCSsettings() settings.atomTyping = atomTyping if not allow_broken_rings: settings.nobreakring = True settings.nobreakaring = True adaptor = canvas.ChmMmctAdaptor() mols = [adaptor.create(st.handle) for st in sts] out_dict = collections.OrderedDict() for item in canvas.runMCS(mols, settings): match_atoms = list(item.mapAtoms) try: out_dict[item.molID].append(match_atoms) except KeyError: out_dict[item.molID] = [match_atoms] assert list(out_dict) == list(range(len(mols))) return list(out_dict.values())
[docs]def group_by_connectivity(st, atoms): """ Groups the atoms by molecule connectivity. Returns a list of atom groups. Each group is a list of atoms that are in the same "molecule" - that are bonded to each other, counting only atoms in specified list. If multiple atoms are in the same molecule, but are separated by atoms that are not in the list (e.g. 2 covalent ligands bound to same protein), they will be grouped separately. :param st: Structure that atoms are from. :type st: `structure.Structure` :param atoms: List of atom indices that are to be grouped. :type atoms: list of ints """ unassigned_atoms = atoms[:] def walk_the_atom(atom, group_atoms, checked_atoms): ai = atom.index group_atoms.add(ai) for bond in atom.bond: if bond.order == 0: continue natom = bond.atom2 nindex = natom.index if nindex in checked_atoms: continue checked_atoms.add(nindex) if nindex in unassigned_atoms: unassigned_atoms.remove(nindex) walk_the_atom(natom, group_atoms, checked_atoms) groups = [] while unassigned_atoms: # Take the next unassigned atom, and find all atoms in the same # "molecule group" as that atom: aindex = unassigned_atoms[0] unassigned_atoms.remove(aindex) checked_atoms = {aindex} group_atoms = set() walk_the_atom(st.atom[aindex], group_atoms, checked_atoms) groups.append(sorted(group_atoms)) return groups
[docs]def find_common_properties(sts: Iterable[Structure]) -> Set[str]: """ Return a set of property names that are common to all selected structures. :type sts: Iterable of `structure.Structure` objects :param sts: Structures to analyze :return: set of property data names :rtype: set of str """ common_props = None for st in sts: st_props = set(list(st.property)) if common_props is None: common_props = st_props else: # Exclude properties that are not present in this row common_props.intersection_update(st_props) if not common_props: break # No common properties - stop the search if not common_props: # No input structures return set() return common_props
[docs]def read_seqres_from_ct(in_ct): """ ct {schrodinger.Structure} Input ct to process Read the SEQRES data from a ct and return as a pair of lists with the same size. The first has the chain names and the second the sequences ie ['A'] and ['ALA ALA ALA '] """ # This is necessary as the actions here adjust the ct in some unknown way ct = in_ct.copy() # Get the PDB sequence strings for each chain: try: data_handle = mm.mmct_ct_m2io_get_unrequested_handle(ct) except mm.MmException: data_handle = mm.mmct_ct_get_or_open_additional_data(ct, True) num_seqres_blocks = mm.m2io_get_number_blocks(data_handle, "m_PDB_SEQRES") if num_seqres_blocks == 0: return [], [] # get sequences from SEQRES chain:seqres out_chains = [] out_seqres = [] mm.m2io_goto_block(data_handle, "m_PDB_SEQRES", 1) num_rows = mm.m2io_get_index_dimension(data_handle) for i in range(1, num_rows + 1): chain_strings = mm.m2io_get_string_indexed(data_handle, i, ["s_pdb_chain_id"]) seqres_strings = mm.m2io_get_string_indexed(data_handle, i, ["s_pdb_SEQRES"]) out_chains.append(chain_strings[0]) out_seqres.append(seqres_strings[0]) return out_chains, out_seqres
[docs]def seq_align_match(fullseq, fragseq, pdbnum, breaklist=None, allow_frag_gaps=False): """ restricted Needleman-Wunsch :param fullseq: Full sequence to work with. Positions in the alignment that match this and not fragseq are given light penalities. Positions in the alignment that fragseq and not this are either not allowed (allow_frag_gaps=True) or have large penalties. This is intended to be the full protein sequence (from the seqres records) when alignining protein full protein sequences to those actually resolved in the experiment. :type fullseq: str :param fragseq: Fragment sequence to work with. This is intended to be the fragment of the protein sequence actually resolved in the experiment (ATOM records) when aligning full protein sequences to those actually resolved in the experiment. :type fragseq: str :param pdbnum: list of tuples with a integer and a character with the same length as fragseq ie [(1, ' ', (1, 'A'), (2, ' ')]} The residue numbers and insertion codes of the residues in fragseq. This allows for the gap penalties to be disregarded when the residue number suggests a gap. :type pdbnum: list(tuple) :param breaklist: { list of Boolean with the same length as fragseq } True values in this list mean that there is a known break after that residue in fragseq so gap penalties are disregarded. :type breaklist: list(bool) :param allow_frag_gaps: see fullseq for a description :type allow_frag_gaps: bool Return value is a string with the same length as the alignment. A M will be at any position that matches the fullseq and fragseq. A U will be at any position that mathces fullseq, but not fragseq. A R will be at any position that mathces fragseq, but not fullseq. """ lfull = len(fullseq) lfrag = len(fragseq) if lfrag == 0: raise RuntimeError('Zero-length sequence in alignment') if lfull - lfrag < 0 and not allow_frag_gaps: raise RuntimeError('Full sequence must be greater or equal to fragment') score = numpy.zeros((lfull, lfrag)) trace = numpy.zeros((lfull, lfrag)) # Fill up the matrix for ifull, cfull in enumerate(fullseq): for ifrag, cfrag in enumerate(fragseq): # If there is a match (no mismatch allowed?) is_match = False if (ifull == 0 or ifrag == 0): score_match = 0 else: score_match = score[ifull - 1, ifrag - 1] if (cfull == cfrag and cfull != 'X'): score_match += 10 is_match = True elif (cfull == 'X' and cfrag == 'X'): score_match += 0 else: # The mismatch score should be HIGH so that we don't # Allow many prefer a pretty big gap over a mismatch score_match += -50 # Gaps in frag are only allowed for 'X' if cfrag == 'X' and allow_frag_gaps: if ifrag == 0: score_gap_frag = 0 else: score_gap_frag = score[ifull, ifrag - 1] - 5 else: score_gap_frag = None # If there is a gap in the full (no gaps allowed in frag) if (ifull == 0): score_gap_full = 0 gap_open_penalty_full = 0 else: score_gap_full = score[ifull - 1, ifrag] # Deal with gap penalties if (trace[ifull - 1, ifrag] == 1): if (ifrag >= len(fragseq) - 1): # There' no penalty for opening a gap at the # end of the fragment gap_open_penalty_full = 0 elif (breaklist and not breaklist[ifrag + 1]): # Skip the gap open penalty if we know there # should be a gap here. gap_open_penalty_full = 0 else: # Default gap penalty is 40 gap_open_penalty_full = 40 # If the residue numbers on either side of the gap # are contigious set up a gap-open penalty of 30 # otherwise set a gap open penalty of 40 to favor # alignments that match the residue numbering scheme try: if (pdbnum[ifrag + 1][0] - pdbnum[ifrag][0] > 1): gap_open_penalty_full = 30 # If the pdb numbers are not given in the input # (TypeError on the minus) or we are at the end of # the list (IndexError) then use the bigger penalty except (TypeError, IndexError): pass else: # not the start of a gap gap_open_penalty_full = 0 score_gap_full = score_gap_full - gap_open_penalty_full if (score_match >= score_gap_full and (score_gap_frag is None or score_match >= score_gap_frag)): score[ifull, ifrag] = score_match trace[ifull, ifrag] = 1 elif (score_gap_frag is None or score_gap_full >= score_gap_frag): score[ifull, ifrag] = score_gap_full trace[ifull, ifrag] = 2 else: score[ifull, ifrag] = score_gap_frag trace[ifull, ifrag] = 3 # Start from the bottom of the matrix and fill out the tracebacks map = [] ifull = len(fullseq) - 1 ifrag = len(fragseq) - 1 while (ifull >= 0 and ifrag >= 0): if (trace[ifull, ifrag] == 1): map.append("M") ifull -= 1 ifrag -= 1 elif (trace[ifull, ifrag] == 2): map.append("U") ifull -= 1 elif (trace[ifull, ifrag] == 3): map.append("R") ifrag -= 1 else: raise RuntimeError("Malformed matrix") while (ifull >= 0): map.append("U") ifull -= 1 while (ifrag >= 0): map.append("R") ifrag -= 1 map.reverse() return map, numpy.max(score)
################################################################################ # Classes ################################################################################
[docs]class AslLigandSearcher(object): """ Search a `Structure` instance for putative ligands with an Atom Selection Language expression. Results are returned as a list of `Ligand` instances. API example:: st = structure.Structure.read('file.mae') st_writer = structure.StructureWriter('out.mae') asl_searcher = AslLigandSearcher() ligands = asl_searcher.search(st) for lig in ligands: st_writer.append(lig.st) st_writer.close() ASL evaluates molecules in a strict sense. Ligands with zero-order bonds to metal and covalently-attached ligands are difficult to find with this naive approach. See __init__ for options that workaround these limitations. 'sidechain', 'backbone', and 'ion' aliases are used by this module. They are taken from first mmasl.ini in the path, but are assumed to be defined as a list of PDB atom names that correspond to atoms of the protein side chains, protein backbone, and small ions respectively. Since the precise definition of a ligand is context specific and impossible to generally formulate, this class attempts to provide customizable tools for identifying ligands within a structure. It is the caller's responsibility to customize the search parameters and verify that the hits are appropriate. For all keyword args, the configured value from Maestro will be used by default. Only specify a keyword arg to override this value. All kwargs (except copy_props) are passed directly to LigandParameters, as defined in the LigandParamaters class in mmasl.h :ivar copy_props: If True then copy the ct-level properties from the searched structure to all the found ligand substructures. If False, only the title will be copied. :vartype copy_props: bool :ivar min_heavy_atom_count: Minimum number of heavy atoms required in each ligand molecule. :vartype min_heavy_atom_count: int :ivar max_atom_count: Maximum number of heavy atoms for a ligand molecule (does not include hydrogens). :vartype max_atom_count: int :ivar allow_ion_only_molecules: Consider charged molecules to be ligands. :vartype allow_ion_only_molecules: bool :ivar allow_amino_acid_only_molecules: If True, consider small molecules containing only amino acids to be ligands. :vartype allow_amino_acid_only_molecules: bool :ivar excluded_residue_names: Set of PDB residue names corresponding to atoms which will never be ligands. :vartype excluded_residue_names: set[str] :ivar included_residue_names: Set of PDB residue names corresponding to atoms which always be considered ligands. :vartype included_residue_names: set[str] :see: `find_ligands` for a simple functional interface to this class. """
[docs] def __init__(self, copy_props=True, **kwargs): """ Initialize searcher. """ self.copy_props = copy_props self.override_params = {} valid_params = [ attr for attr in dir(mm.get_ligand_parameters()) if not attr.startswith('_') ] for param, value in kwargs.items(): if param not in valid_params: msg = 'Parameter {} is not a valid ligand parameter. Valid parameters are: {}' raise ValueError(msg.format(param, valid_params)) self.override_params[param] = value
""" Properties: setting LigandParameters via this interface is deprecated. """ @property def min_atom_count(self): return self._getLigandParameters().min_heavy_atom_count @min_atom_count.setter def min_atom_count(self, value): warnings.warn('min_atom_count is deprecated.', DeprecationWarning, stacklevel=2) self.override_params['min_heavy_atom_count'] = value @property def max_atom_count(self): return self._getLigandParameters().max_atom_count @max_atom_count.setter def max_atom_count(self, value): warnings.warn('max_atom_count is deprecated.', DeprecationWarning, stacklevel=2) self.override_params['max_atom_count'] = value @property def exclude_ions(self): return not self._getLigandParameters().allow_ion_only_molecules @exclude_ions.setter def exclude_ions(self, value): warnings.warn('exclude_ions is deprecated.', DeprecationWarning, stacklevel=2) self.override_params['allow_ion_only_molecules'] = not value @property def exclude_amino_acids(self): return not self._getLigandParameters().allow_amino_acid_only_molecules @exclude_amino_acids.setter def exclude_amino_acids(self, value): warnings.warn('exclude_amino_acids is deprecated.', DeprecationWarning, stacklevel=2) self.override_params['allow_amino_acid_only_molecules'] = not value @property def excluded_residues(self): return self._getLigandParameters().excluded_residue_names @excluded_residues.setter def excluded_residues(self, value): warnings.warn('excluded_residues is deprecated.', DeprecationWarning, stacklevel=2) self.override_params['excluded_residue_names'] = set(value) def _getLigandParameters(self): parameters = mm.get_ligand_parameters() for k, v in self.override_params.items(): setattr(parameters, k, v) return parameters
[docs] def search(self, st): """ Find list of putative ligands matching either `ligand_asl` or the default internally generated ASL. :type st: `Structure` :param st: Structure to search for ligands. :rtype: list :return: a list of `Ligand` instances. These are putative ligands that match the ASL expression. See `Ligand` for attributes. """ ligand_mols = mm.find_ligand_molecules(st, self._getLigandParameters()) return self._extract_ligands(st, ligand_mols)
def _extract_ligands(self, st, ligand_mol_atoms): ligands = [] # For each ligand, extract a substructure and # create a Ligand instance to contain information about it. for lig_atoms in ligand_mol_atoms: molecule_number = st.atom[lig_atoms[0]].molecule_number lig_st = st.extract(lig_atoms) if self.copy_props: lig_st.property.update(st.property) # PYTHON-1757: Generate the ASL with respect to the original # input structure: lig_asl = generate_asl(st, lig_atoms) covalent = len(lig_atoms) < len(st.molecule[molecule_number].atom) lig = Ligand(st, lig_st, molecule_number, lig_atoms, lig_asl, covalent) ligands.append(lig) lig_count = len(ligands) msg = 'AslLigandSearcher.search basic search found %d' % lig_count logger.debug(msg) return ligands
[docs]class Ligand(object): """ A putative `AslLigandSearcher` ligand structure with read-only data and convenience methods. `Ligand` items sort from smallest to largest, by total number of atoms, then by SMILES. Parameters: * complex_st: Original complex structure. * st: Ligand substructure * mol_num: Ligand molecule number in the original structure NOTE: molecule contains non-ligand atoms for covalently bound ligands. * atom_indexes: Atom indices into the original structure for this ligand. * atom_objects: List of ligand atom objects from the original structure. * lig_asl: ASL that matches the ligand atoms in the original structure. * is_covalently_bound: Whether the ligand is covalently bound. Depreacted. * pdbres: PDB residue name identifier. * centroid: Centroid of ligand as a 4-element numpy array: [x, y, z, 0.0] * unique_smiles: SMILES string representing this ligand structure. """
[docs] def __init__(self, complex_st, st, mol_num=None, atom_indexes=None, lig_asl=None, is_covalently_bound=None): """ :type st: `Structure` :param st: Original complex structure. :type st: `Structure` :param st: Ligand structure. :type mol_num: int :param mol_num: Molecular index identifier. Typically, the mol.n from the original structure from whence this ligand structure was derived. Note, depending on the nature of the ligand and the treatment of the original structure this mol.n index may not be valid. :type atom_indexes: list :param atom_indexes: Atom index identifiers. Typically, the at.n from the original structure from whence this ligand structure was derived. :type lig_asl: str :param lig_asl: ASL identifier. Typically, the expression is defined in terms of the original structure from whence this ligand structure was derived. :deprecated is_covalently_bound: Whether this ligand is bonds to other atoms (including zero-order bonds). Will be False if the ligand spans a whole molecule. """ self.complex_st = complex_st self._st = st self._mol_num = mol_num self._atom_indexes = atom_indexes self._lig_asl = lig_asl self._centroid = None self._smiles = None self._resname = None self._is_covalently_bound = is_covalently_bound return
@property def is_covalently_bound(self): """ The Ligand.is_covalently_bound property returns True if this ligand has any bonds (including zero-order) to any other atoms, and returns False if the ligand spans a complete molecule. """ return self._is_covalently_bound
[docs] def sort_key(self): """ Enable sorting for `Ligand` objects: ligands.sort(key=lambda l: l.sort_key()) Comparison criteria for sorting Ligands: total number of atoms, unique smiles string, centroid. :return: sort key :rtype: list """ sort_items = [self.st.atom_total, self.unique_smiles] sort_items.extend(self.centroid.flatten()) return sort_items
def __str__(self): """ :rtype: str :return: the unique SMILES string for this ligand. """ return self.unique_smiles # Below are a slew of functions with @property decorators that # create read-only attributes. @property def mol_num(self): """ Ligand's molecule number as defined upon instantiation. :rtype: int Warning: Depending on the nature of the ligand and the treatment of the original structure, e.g. zero-order bonds cut, this mol.n index may not be valid. """ return self._mol_num @property def atom_indexes(self): """ Indices of the Ligand atoms as defined upon instantiation. :rtype: list """ return self._atom_indexes @property def atom_objects(self): """ Atom objects from the original structure for the ligand atoms. :rtype: list """ return [self.complex_st.atom[anum] for anum in self._atom_indexes] @property def pdbres(self): """ PDB residue name identifier. If the ligand is composed of multiple residues then the names are joined with a '-' separator. :rtype: str """ if self._resname is None: resname = [] for at in self._st.atom: pdbres = at.pdbres if pdbres not in resname: resname.append(pdbres) self._resname = "-".join(resname) return self._resname @property def centroid(self): """ Centroid of the Ligand as a 4-element numpy array: [x, y, z, 0.0] :rtype: 4-element `numpy array<numpy.ndarray>` """ if self._centroid is None: self._centroid = transform.get_centroid(self._st) return self._centroid @property def unique_smiles(self): """ Unique SMILES string representing this ligand structure. :rtype: str """ if self._smiles is None: self._smiles = generate_smiles( self._st, unique=True, stereo=STEREO_FROM_ANNOTATION_AND_GEOM) return self._smiles @property def st(self): """ Copy of the ligand `Structure`. :rtype: `Structure` """ return self._st.copy() @property def ligand_asl(self): """ Ligand_asl used when searching for the ligand. The ASL defined the ligand in the context of its original structure. :rtype: str """ return self._lig_asl
[docs]class MissingLoopFinder(object): """ compare SEQRES and ATOM record and find missing loops. Does not use the order of the residues in the CT to avoid issues that occur when missing loops are searched for after some loops have been added by another program """
[docs] def __init__(self): pass
[docs] def run(self, ct, include_tails=False, legacy_output=False, debug=False): """ Compare the SEQRES records the structual atom records to find missing loops. :returns: tuples of (<residue object of the residue before the missing loop or NONE if this is a missing N-terminal tail>, <residue object of the residue after the missing loop or NONE if this is a missing C-terminal tail>, <list of residue types missing as a list of 3char strings> :param debug: is True then the allignments will be printed to stdout :type debug: bool :param include_tails: if True then missing N and C terminal tails will be included """ # Get the SEQRES data from the ct seqres_chains, seqres_sequences = read_seqres_from_ct(ct) seqres_sequences = [[ one_seq[i:i + 3] for i in range(0, len(one_seq), 4) ] for one_seq in seqres_sequences] res_for_seqres_sequences = [ [None for res in one_seq] for one_seq in seqres_sequences ] ct_chains, ct_sequences = self._getCTSequences(ct) used_ct_sequence = [False for one_seq in ct_sequences] # Start doing alignments while True: # Find the best score best_map = None for ict, (one_ct_chain, one_ct_sequence) in enumerate(zip(ct_chains, ct_sequences)): if used_ct_sequence[ict]: continue # Skip if all of the residues are nonstandard if not numpy.any([ mm.mmct_is_standard_residue(res.pdbres) for res in one_ct_sequence ]): continue one_ct_pdbname = [res.getCode() for res in one_ct_sequence] one_ct_pdbnum = [res.resnum for res in one_ct_sequence] for iseqres, (one_seqres_chain, one_seqres_sequence) in \ enumerate(zip(seqres_chains, seqres_sequences)): if one_seqres_chain != one_ct_chain: continue one_seqres_pdbname = [ RESIDUE_MAP_3_TO_1_LETTER.get(pdbres, "X") for pdbres in one_seqres_sequence ] for i in range(len(res_for_seqres_sequences[iseqres])): if res_for_seqres_sequences[iseqres][i] is not None: one_seqres_pdbname[i] = 'X' map, map_score = seq_align_match(one_seqres_pdbname, one_ct_pdbname, one_ct_pdbnum, allow_frag_gaps=True) if best_map is None or best_map[1] < map_score: best_map = (map, map_score, ict, iseqres) if best_map is None or best_map[1] <= 0: break # Mark these residues as used map, map_score, ict, iseqres = best_map used_ct_sequence[ict] = True ires = 0 iseqres_res = 0 for i in range(len(map)): if map[i] == 'M': res_for_seqres_sequences[iseqres][iseqres_res] = \ ct_sequences[ict][ires] if map[i] in ['M', 'R']: ires += 1 if map[i] in ['M', 'U']: iseqres_res += 1 missing_loops = [] for chain, sequence, residues in zip(seqres_chains, seqres_sequences, res_for_seqres_sequences): first_res_missing_loop = None missing_loop_sequence = [] for isr, (one_seq, one_res) in enumerate(zip(sequence, residues)): if one_res is None: if not missing_loop_sequence: if isr == 0: first_res_missing_loop = None else: first_res_missing_loop = residues[isr - 1] missing_loop_sequence.append(one_seq) res_name = "MISSING" else: if missing_loop_sequence: missing_loops.append((first_res_missing_loop, one_res, missing_loop_sequence)) missing_loop_sequence = [] res_name = "%s:%d%s(%s)" % (one_res.chain, one_res.resnum, one_res.inscode, one_res.pdbres) if debug: print("%s: %3s: %s" % (chain, one_seq, res_name)) if missing_loop_sequence: missing_loops.append( (first_res_missing_loop, None, missing_loop_sequence)) if not include_tails: missing_loops = [(beg_res, end_res, seq) for (beg_res, end_res, seq) in missing_loops if (beg_res is not None and end_res is not None)] if legacy_output: return ["%s-%s" % (beg, end) for (beg, end, seq) in missing_loops] else: return missing_loops
def _getCTSequences(self, ct): """ Get the residues sequenecs from the structure input ct{Schrodinger.Structure} input structure Output is a tuple of lists. The first list is the chainID for each of the chains in the molecule as a list of 1-character strings. The second is a list of lists with with the 3-letter code for each residue in the sequence of each chain. All chains are seperated wherever a chain break occurs even if both sets of resideus are on the same chain """ # Read in all of the sequence fragments from the structure ct_sequences = [] one_chain = None one_sequence = [] last_res = None for chain in ct.chain: for res in get_residues_by_connectivity(chain): # Deal with disconnected density if (res.getAlphaCarbon() is None and res.getCarbonylCarbon() is None and res.getBackboneNitrogen() is None): continue if (one_chain is None or last_res is None or one_chain != res.chain or not self._residuesAreConnected(last_res, res)): if one_sequence is not None and one_chain is not None: ct_sequences.append(one_sequence) one_chain = None one_sequence = [] one_chain = res.chain one_sequence.append(res) last_res = res if one_sequence is not None and one_chain is not None: ct_sequences.append(one_sequence) # Sort the chains by the chain name and residue number ct_sequences.sort(key=lambda x: (x[0].chain, x[0].resnum, x[0].inscode)) # Pull the chain for each sequence ct_chains = [one_sequence[0].chain for one_sequence in ct_sequences] return ct_chains, ct_sequences def _residuesAreConnected(self, res1, res2): """ Return True if there are any bonds connecting res1 and res2, False otherwise inputs res1{Schrodinger.Structure._Resdiue} Input residue 1 res2{Schrodinger.Structure._Resdiue} Input residue 2 """ if res1.structure != res2.structure: return False res2_atoms = set(res2.getAtomIndices()) res1_bonded_atoms = set() for atom in res1.atom: for bonded_atom in atom.bonded_atoms: if int(bonded_atom) in res2_atoms: return True return False
[docs]def get_low_energy_reps(sts, eps, key=None): """ Cluster the given structures by energy using the given energy key function and precision and return the lowest energy structures from each cluster sorted by energy. :type sts: list[`schrodinger.structure.Structure`] :param sts: the structures :type eps: float :param eps: the precision that controls the size of the clusters, see sklearn.cluster.DBSCAN documentation for more details :type key: function or None :param key: the function to get the energy from the structure, if None minimize.compute_energy will be used :raise ValueError: if there is an issue :rtype: list[`schrodinger.structure.Structure`] :return: representative structures sorted by increasing energy """ if not key: key = lambda x: minimize.compute_energy(x) # in case the energy function is expensive cache it tmp_energy = 'r_user_tmp_energy' for st in sts: if tmp_energy in st.property: # just in case if for some reason the caller has 'r_user_tmp_energy' # defined for a different purpose raise ValueError(f'The {tmp_energy} is already defined.') st.property[tmp_energy] = key(st) tmp_key = lambda x: x.property[tmp_energy] groups = mathutils.dbscan_cluster(sts, eps, key=tmp_key) sts = [min(group, key=tmp_key) for group in groups.values()] sts = sorted(sts, key=tmp_key) for st in sts: st.property.pop(tmp_energy) return sts