Source code for schrodinger.application.matsci.cg_mapping

"""
Utilities for mapping CG structures to AA

Copyright Schrodinger, LLC. All rights reserved.
"""

import itertools
from collections import OrderedDict
from collections import defaultdict
from collections import namedtuple

import networkx
import numpy

from schrodinger import structure
from schrodinger.application.matsci import clusterstruct
from schrodinger.application.matsci import coarsegrain
from schrodinger.application.matsci import msutils
from schrodinger.application.matsci import msprops
from schrodinger.application.matsci import smartsutils
from schrodinger.application.matsci.nano import xtal
from schrodinger.infra import mm
from schrodinger.structutils import analyze
from schrodinger.structutils import transform

# just create 130 to avoid potential future issues
MOLECULE_COLORS = 10 * [
    (255, 128, 0),  # orange
    (0, 0, 255),  # blue
    (255, 255, 0),  # yellow
    (0, 255, 0),  # green
    (0, 255, 255),  # aqua
    (127, 0, 255),  # purple
    (255, 0, 255),  # magenta
    (153, 76, 0),  # brown
    (128, 128, 128),  # gray
    (255, 0, 127),  # pink
    (76, 153, 0),  # dark green
    (0, 76, 153),  # royal blue
    (255, 255, 255)  # white
]

# unique bond keys
SMARTS_UNIQUE_BOND_LABEL = 'smarts'
SMARTS_UNIQUE_BOND_KEY = smartsutils.GROUP_ATOM_INDEXES_PROP
POLYMER_UNIQUE_BOND_LABEL = 'polymer'
POLYMER_UNIQUE_BOND_KEY = msprops.MONOMER_ORIG_ATOM_IDX_PROP
RESIDUE_UNIQUE_BOND_LABEL = 'residue'
RESIDUE_UNIQUE_BOND_KEY = msprops.PDB_ATOM_NAME_KEY
UNIQUE_BOND_DICT = OrderedDict([
    (SMARTS_UNIQUE_BOND_LABEL, SMARTS_UNIQUE_BOND_KEY),
    (POLYMER_UNIQUE_BOND_LABEL, POLYMER_UNIQUE_BOND_KEY),
    (RESIDUE_UNIQUE_BOND_LABEL, RESIDUE_UNIQUE_BOND_KEY)
])

# chiral keys, CHIRAL_SYM_DICT maps pair tuples of bonded
# coarse grain particle chiralities according to symmetry
# equivalence, for example ('S', 'R') maps to ('R', 'S')
# and ('S', 'S') maps to ('R', 'R'), any tuple containing
# an achiral particle, i.e. 'A', maps to a pair of achiral
# particles, for example ('R', 'A') maps to ('A', 'A')
CHIRAL_PROP_DICT = {
    msprops.CHIRAL_R_MONOMER_PROP: msprops.CHIRAL_R,
    msprops.CHIRAL_S_MONOMER_PROP: msprops.CHIRAL_S
}
CHIRAL_A = 'A'
get_chiral_hash = lambda x: x if CHIRAL_A not in x else 2 * (CHIRAL_A,)
get_chiral_pair = lambda x: (x, tuple(sorted(get_chiral_hash(x))))
ARS = CHIRAL_A + msprops.CHIRAL_R + msprops.CHIRAL_S
CHIRAL_SYM_DICT = dict(map(get_chiral_pair, itertools.product(ARS, repeat=2)))
CHIRAL_RR = 2 * (msprops.CHIRAL_R,)
CHIRAL_SS = 2 * (msprops.CHIRAL_S,)
CHIRAL_SYM_DICT[CHIRAL_SS] = CHIRAL_RR
CHIRAL_SEPARATOR = coarsegrain.CHIRAL_SEPARATOR

RESIDUE_NAME_KEY = 's_m_pdb_residue_name'
RESIDUE_NUMBER_KEY = 'i_m_residue_number'
DEFINITION_NAME_KEYS = [smartsutils.GROUP_NAME_PROP, \
    smartsutils.GROUP_NAMES_PROP, msprops.MONOMER_NAME_PROP, RESIDUE_NAME_KEY]
DEFINITION_NUMBER_KEYS = [smartsutils.GROUP_NUMBER_PROP, \
    smartsutils.GROUP_NUMBERS_PROP, RESIDUE_NUMBER_KEY, RESIDUE_NUMBER_KEY]
DEFINITION_KEYS = list(zip(DEFINITION_NAME_KEYS, DEFINITION_NUMBER_KEYS))

PDB_ATOM_NAME_KEY = msprops.PDB_ATOM_NAME_KEY
POLYMER_BACKBONE_TAG = '-backbone'
POLYMER_BACKBONE_AND_ADJOINING_TAG = '-backbone-and-adjoining'
POLYMER_SIDEGROUP_TAG = '-sidegroup-'
PROTEIN_BACKBONE_TAG = '-backbone'
PROTEIN_SIDECHAIN_TAG = '-sidechain'
GENERAL_TAG = 'molecule-'

# PRO is allowed to have a side-chain
RESIDUES_NO_SIDECHAIN = ['ACE', 'GLY', 'NMA']
BACKBONE_PDB_ATOM_NAMES = ['H', 'N', 'CA', 'HA', 'C', 'O']

NonContiguousBondingMsg = ('Particles containing atoms that are '
                           'not contiguously bonded are currently not '
                           'supported.')

CG_BOND_ORDER = 1
CG_PARTICLE_INDICES = 's_matsci_cg_particle_indices'
CG_TAG = '_cg'

# CG model will inherit the following properties from
# its parent structure
INHERITABLE_PROPERTIES = [
    xtal.Crystal.SPACE_GROUP_KEY, xtal.Crystal.A_KEY, xtal.Crystal.B_KEY,
    xtal.Crystal.C_KEY, xtal.Crystal.ALPHA_KEY, xtal.Crystal.BETA_KEY,
    xtal.Crystal.GAMMA_KEY, xtal.Crystal.CHORUS_BOX_AX_KEY,
    xtal.Crystal.CHORUS_BOX_AY_KEY, xtal.Crystal.CHORUS_BOX_AZ_KEY,
    xtal.Crystal.CHORUS_BOX_BX_KEY, xtal.Crystal.CHORUS_BOX_BY_KEY,
    xtal.Crystal.CHORUS_BOX_BZ_KEY, xtal.Crystal.CHORUS_BOX_CX_KEY,
    xtal.Crystal.CHORUS_BOX_CY_KEY, xtal.Crystal.CHORUS_BOX_CZ_KEY
]

# prop_name is the atom property name associated with the atom definition, idx
# is the index of the sought definition (used for sorting available
# definitions), name is the name of the definition, and numberstr is the molecule
# number (n), chain name (c), and match index (m) combined as 'n-c-m'
# for example ('s_m_pdb_residue_name', 2, GLY, 4-B-7) means that this atom
# has with residue name as (s_m_pdb_residue_name) GLY matches the 3rd sought
# definition (0 indexed) and is in the 4th molecule, chain B, and 7th of such
# residues
ATOM_DEFINITION = namedtuple('ATOM_DEFINITION',
                             ['prop_name', 'idx', 'name', 'numberstr'])


[docs]def get_cg_particle_indices(atom): """ Return a list of CG particle indices for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: CG particle indices """ tokens = atom.property.get(CG_PARTICLE_INDICES) if tokens: return list(map(int, tokens.split(smartsutils.ATOM_PROP_DELIM))) return []
[docs]class NonContiguousBondingError(Exception): """ Exception to handle non contiguous bonding in CG """
[docs] def __init__(self, msg=NonContiguousBondingMsg, *args, **kwargs): """ Exception with default error message """ super().__init__(msg, *args, **kwargs)
[docs]def extract_and_contract(struct, idxs=None, pbc=None, fast=False): """ From the given indices and structure extract a structure and contract it according to the PBC. :type struct: structure.Structure :param struct: the structure from which to extract :type idxs: set or None :param idxs: a set of integer atom indices defining the structure to be extracted or None in which case all indices will be used :type pbc: `schrodinger.infra.structure.PBC` :param pbc: A pre-computed PBC for the structure :type fast: bool :param fast: True if the computation should be done on a throw-away structure that will not be used for more than a center of mass calculation, False if the extracted structure must preserve properties such as atom formal charge and structure properties. :rtype: structure.Structure :return: the extracted and contracted structure """ # no need to pass in anchors to clusterstruct.contract_by_molecule # as it is guaranteed that at least a single atom will not move # and therefore that the contraction doesn't suffer from drift, # it is possible that the stationary atom is always the first atom if idxs is None: idxs = set(range(1, struct.atom_total + 1)) if fast: # extract is a very expensive action if the parent structure is large, # so just creating a new structure with atoms at the same positions is # vastly faster sub_st = structure.create_new_structure() for index in idxs: atom = struct.atom[index] # Creating as 'C' avoids mmat warnings for some elements newat = sub_st.addAtom('C', *atom.xyz) newat.element = atom.element # see MATSCI-10422 - where parent atom mass depends # on this property numbers = atom.property.get(smartsutils.GROUP_NUMBERS_PROP) if numbers: newat.property[smartsutils.GROUP_NUMBERS_PROP] = numbers # We have to bond all atoms in the molecule so that it is a single # molecule, however, the actualy bonding *does not matter* as long as # there is a continuous bond path connecting all atoms, so just fake # it rather than try to reproduce the original bonding for index in range(1, sub_st.atom_total): sub_st.addBond(index, index + 1, 1) else: sub_st = struct.extract(idxs, copy_props=True) synced_st = xtal.sync_pbc(sub_st) if synced_st: sub_st = synced_st if not pbc: try: pbc = clusterstruct.create_pbc(sub_st) except ValueError: pass if pbc: clusterstruct.contract_by_molecule(sub_st, pbc=pbc) return sub_st
[docs]def get_center_of_mass_weight(atom): """ Return the weight of the atom used for the center of mass calculation. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: float :return: the weight of the atom in amu """ numbers = smartsutils.get_group_numbers(atom) if numbers: n_particles_w_atom = len(numbers) else: n_particles_w_atom = 1 return 20 * numpy.tanh(atom.atomic_weight / 20) / n_particles_w_atom
[docs]def center_of_mass(struct, idxs=None, pbc=None): """ Return the center of mass of the given indices in the given structure accounting for if the structure has a PBC. :type struct: structure.Structure :param struct: the structure containing the atoms for which the center of mass is needed :type idxs: set or None :param idxs: a set of integer atom indices for which the center of mass is needed or None in which case all atoms will be used :type pbc: `schrodinger.infra.structure.PBC` :param pbc: A pre-computed PBC for the structure :rtype: numpy.array :return: the center of mass """ if idxs is None: idxs = set(range(1, struct.atom_total + 1)) sub_st = extract_and_contract(struct, idxs=idxs, pbc=pbc, fast=True) xyzs = sub_st.getXYZ() weights = [get_center_of_mass_weight(atom) for atom in sub_st.atom] return numpy.average(xyzs, weights=weights, axis=0)
[docs]class Particle(object): """ Create a coarse grain particle. """
[docs] def __init__(self): """ Create an instance. """ self.xyz = None self.vdw_radius = None self.atomic_weight = None self.formal_charge = None self.partial_charge = None self.formula = None self.name = None self.rgb_color = None self.atom_type = None self.key = None self.parent_structure = None self.contracted_parent_structure = None self.parent_structure_str = None self.parent_indices = None self.parent_cut_bonds = None
[docs] def setXYZ(self, xyz=None, astructure=None, pbc=None): """ Set the position of the given particle. :type xyz: numpy.array or None :param xyz: the position of the particle or None if there isn't one :type astructure: structure.Structure or None :param astructure: the structure from which the parent structure will be extracted and for which the center of mass is needed or None if there isn't one :type pbc: `schrodinger.infra.structure.PBC` :param pbc: A pre-computed PBC for the structure """ if xyz is not None: self.xyz = xyz elif astructure is not None: self.xyz = center_of_mass(astructure, idxs=self.parent_indices, pbc=pbc)
[docs] def setVDWRadius(self, vdw_radius=None): """ Set the VDW radius in Ang. :type vdw_radius: float or None :param vdw_radius: the VDW radius of the given particle in Ang. or None if there isn't one """ if vdw_radius is not None: self.vdw_radius = vdw_radius else: max_distance = 0.0 for atom in self.contracted_parent_structure.atom: distance = transform.get_vector_magnitude( numpy.array(atom.xyz) - self.xyz) distance += atom.radius if distance > max_distance: max_distance = distance self.vdw_radius = max_distance
[docs] def setAtomicWeight(self, atomic_weight=None): """ Set the atomic weight in g/mol. :type atomic_weight: float or None :param atomic_weight: the atomic weight of the given particle in g/mol or None if there isn't one """ if atomic_weight is not None: self.atomic_weight = atomic_weight else: # self.parent_structure can be a fragment and so its # .total_weight property includes implicit hydrogens # which we don't want so compute it by hand self.atomic_weight = \ sum([mm.mmelement_get_atomic_weight_by_atomic_number(atom.atomic_number) for atom in self.parent_structure.atom])
[docs] def setFormalCharge(self, formal_charge=None): """ Set the formal charge. :type formal_charge: int :param formal_charge: the formal charge of the given particle or None if there isn't one """ if formal_charge is not None: self.formal_charge = formal_charge else: self.formal_charge = self.parent_structure.formal_charge
[docs] def setPartialCharge(self, partial_charge=None): """ Set the partial charge. :type partial_charge: float :param partial_charge: the partial charge of the given particle or None if there isn't one """ if partial_charge is not None: self.partial_charge = partial_charge else: self.partial_charge = \ sum([atom.partial_charge for atom in self.parent_structure.atom])
[docs] def setParentStructure(self, astructure): """ Set the parent structure. :type astructure: structure.Structure :param astructure: the structure from which the parent structure will be extracted """ self.parent_structure = \ astructure.extract(self.parent_indices, copy_props=True) self.contracted_parent_structure = extract_and_contract( astructure, idxs=self.parent_indices)
[docs] def setParentStructureStr(self): """ Set the parent structure string. """ self.parent_structure_str = coarsegrain.get_base64_str_from_structure( self.parent_structure)
[docs] def setParentIndices(self, parent_indices): """ Set the parent indices. :type parent_indices: list :param parent_indices: the parent indices, i.e. indices of the parent atomic structure for this particle """ self.parent_indices = parent_indices
[docs] def setParentCutBonds(self, astructure): """ Set the parent cut bonds. :type astructure: structure.Structure :param astructure: the structure from which the parent structure was extracted and thus for which the bonding information is needed """ # this list contains pair tuples for bonds that crossed the # particle boundary in the parent structure, the first atom # is the particle atom, the second atom is the atom that is # outside the particle self.parent_cut_bonds = [] for aidx in self.parent_indices: aatom = astructure.atom[aidx] for batom in aatom.bonded_atoms: bidx = batom.index if bidx not in self.parent_indices: order = astructure.getBond(aidx, bidx).order self.parent_cut_bonds.append((aidx, bidx, order))
[docs] def setFormula(self, formula=None): """ Set the formula of the given particle. :type formula: str or None :param formula: the formula for the given particle or None if there isn't one """ if formula is not None: self.formula = formula else: self.formula = analyze.generate_molecular_formula( self.parent_structure)
[docs] def setName(self, name): """ Set the name of the given particle. :type name: str :param name: name of the particle """ self.name = name
[docs] def setRGBColor(self, rgb_color): """ Set the RGB color of the given particle. :type rgb_color: tuple :param rgb_color: a triple of integers in [0, 255] that give an RGB color for the given particle """ self.rgb_color = rgb_color
[docs] def setAtomType(self, atom_type): """ Set the atom type of the given particle. :type atom_type: int :param atom_type: atom type of the given particle """ self.atom_type = atom_type
[docs] def setKey(self, key): """ Set the key of the given particle, this is the key that indicates the parent structure string. :type key: str :param key: key of the given particle """ self.key = key
[docs] def defineAtom(self, astructure, index): """ Define the given atom. :type astructure: structure.Structure :param astructure: the structure containing the atom to define :type index: int :param index: index of the atom to define """ # note that mmct_atom_set_coarse_grain_particle must be called # after setting the atom.atom_type or warnings regarding # the calling of mmct_atom_set_isotope and mmct_atom_set_atomic_number # on atoms marked as coarse grain particles will be raised, also # mmct_atom_set_coarse_grain_particle must be called before # mmct_atom_set_coarse_grain_mass atom = astructure.atom[index] atom.x = self.xyz[0] atom.y = self.xyz[1] atom.z = self.xyz[2] coarsegrain.set_atom_coarse_grain_properties( astructure, atom, self.name, rgb=self.rgb_color, atom_type=self.atom_type, formal_charge=self.formal_charge, partial_charge=self.partial_charge, radius=self.vdw_radius, mass=self.atomic_weight) if self.key: atom.property[coarsegrain.CG_PARTICLE_KEY_KEY] = self.key if self.parent_indices: atom.property[coarsegrain.CG_PARTICLE_PARENT_INDICES_KEY] = \ str(tuple(self.parent_indices)) if self.parent_cut_bonds: atom.property[coarsegrain.CG_PARTICLE_PARENT_BONDS_KEY] = \ str(tuple(self.parent_cut_bonds))
[docs] @staticmethod def getParentIndices(cg_structure, cg_idx): """ Return the parent indices for the given coarse grain particle index in the given coarse grain structure. :type cg_structure: structure.Structure :param cg_structure: the coarse grain structure :type cg_idx: int :param cg_idx: the coarse grain particle index :rtype: tuple :return: the parent indices for the given coarse grain particle index """ idxs = cg_structure.atom[cg_idx].property[ coarsegrain.CG_PARTICLE_PARENT_INDICES_KEY] return eval(idxs)
[docs] @staticmethod def getParentCutBonds(cg_structure, cg_idx, reverse=False): """ Return the parent cut bonds for the given coarse grain particle index in the given coarse grain structure. :type cg_structure: structure.Structure :param cg_structure: the coarse grain structure :type cg_idx: int :param cg_idx: the coarse grain particle index :type reverse: bool :param reverse: by default the first atom index of a pair from the returned tuple of pairs is for the particle with the given index and the second atom index of the pair is for the outside particle, if reverse is True it reverses this ordering :rtype: tuple :return: the parent cut bonds for the given coarse grain particle index """ idxs = cg_structure.atom[cg_idx].property[ coarsegrain.CG_PARTICLE_PARENT_BONDS_KEY] idxs = eval(idxs) if reverse: idxs = tuple([(j, i, k) for i, j, k in idxs]) return idxs
[docs] @staticmethod def getParentBonds(cg_structure, cg_idx_i, cg_idx_j): """ Return the parent bonds between the given coarse grain particles indices in the given coarse grain structure. :type cg_structure: structure.Structure :param cg_structure: the coarse grain structure :type cg_idx_i: int :param cg_idx_i: the first coarse grain particle index :type cg_idx_j: int :param cg_idx_j: the second coarse grain particle index :rtype: tuple :return: the parent bonds between the given coarse grain particle indices """ bonds = [] for bond in Particle.getParentCutBonds(cg_structure, cg_idx_i): if bond[1] in Particle.getParentIndices(cg_structure, cg_idx_j): bonds.append(bond) return tuple(bonds)
[docs]class CoarseGrainBond(object): """ Manage a coarse grain bond. """ FORMAT_START = '(' FORMAT_END = ')' DUMMY_ELEMENT = 'DU'
[docs] def __init__(self, name_1, idx_1, name_2, idx_2): """ Create an instance. :type name_1: str :param name_1: the name of particle 1 :type idx_1: int :param idx_1: the index particle 1 :type name_2: str :param name_2: the name of particle 2 :type idx_2: int :param idx_2: the index particle 2 """ self.name_1 = name_1 self.idx_1 = idx_1 self.name_2 = name_2 self.idx_2 = idx_2 self.parent_indices_1 = None self.parent_cut_bonds_1 = None self.parent_indices_2 = None self.parent_cut_bonds_2 = None self.parent_bonds = None self.smiles = None self.label_1 = None self.label_2 = None self.chiral_hash = None self.parent_bonds_hash_info = None self.parent_bonds_hash = None
[docs] def setParentData(self, cg_structure): """ Set the parent data. :type cg_structure: structure.Structure :param cg_structure: the CG model """ pair = (self.idx_1, self.idx_2) inds = [Particle.getParentIndices(cg_structure, x) for x in pair] self.parent_indices_1, self.parent_indices_2 = inds cuts = [Particle.getParentCutBonds(cg_structure, x) for x in pair] self.parent_cut_bonds_1, self.parent_cut_bonds_2 = cuts self.parent_bonds = Particle.getParentBonds(cg_structure, *pair)
[docs] def getAllParentIndices(self): """ Return a tuple containing all parent indices. :rtype: tuple :return: contains all parent indices """ return tuple(set(self.parent_indices_1 + self.parent_indices_2))
[docs] def setSmiles(self, st): """ Set a unique SMILES for the parent indices. :type st: structure.Structure :param st: the parent model """ # get all parent indices, this includes the parent atom indices # of each particle as well as the indices of any atoms bonded # to the parent atoms all_parent_indices = self.getAllParentIndices() out_idxs = set() for atuple in self.parent_cut_bonds_1 + self.parent_cut_bonds_2: in_idx, out_idx, order = atuple if out_idx not in all_parent_indices: out_idxs.add(out_idx) all_parent_indices += tuple(out_idxs) all_parent_indices = sorted(all_parent_indices) # extract structure parent_st = st.extract(all_parent_indices) # see MATSCI-3619, handle dangling bonds, we have a fragment # with dangling bonds, if we remove the dangling bonds then # the smiles generation will necessarily create implicit # hydrogens, we will prevent that by keeping the dangling # atoms but modifying them to be dummy atoms, i.e. elemental # symbol of DU, these dummy atoms show as wildcards (*) in # the patterns, note these patterns although obtained with # analyze.generate_smiles are actually smarts patterns as # they are matching and can not generate structures alone, # for example as with structure.SmilesReader, for example the # smarts patterns for the three bond types in standard P3HT # (polythiophene) polymer look like: # # (1) [*]c(c1)sc(c1[*])-c(c2[*])sc([*])c2 # (2) [*]c1c([*])sc(c1)-c(c2)sc([*])c2[*] # (3) [*]c1c([*])sc(c1)-c(c2[*])sc([*])c2 # amap = dict( zip(all_parent_indices, list(range(1, len(all_parent_indices) + 1)))) for old in out_idxs: new = amap[old] aatom = parent_st.atom[new] aatom.element = self.DUMMY_ELEMENT # get smiles self.smiles = analyze.generate_smiles( parent_st, unique=True, stereo=analyze.STEREO_FROM_ANNOTATION_AND_GEOM)
[docs] def setDefaultParticleLabels(self, st): """ Set the default particle labels for the two coarse grain particles. :type st: structure.Structure :param st: the parent model """ inside_idx_type_dict, outside_idx_type_dict = map( dict, zip(*self.parent_bonds_hash_info)) element_dicts = [] for idx_type_dict in (inside_idx_type_dict, outside_idx_type_dict): element_dict = {} for idx in idx_type_dict: element_dict.setdefault(st.atom[idx].element, []).append(idx) element_dicts.append(element_dict) inside_element_dict, outside_element_dict = element_dicts inside_chiral_sym, outside_chiral_sym = list(self.chiral_hash) inside_items = (inside_element_dict, inside_chiral_sym, inside_idx_type_dict) outside_items = (outside_element_dict, outside_chiral_sym, outside_idx_type_dict) labels = [] for element_dict, chiral_sym, type_dict in (inside_items, outside_items): label = '' for element in sorted(element_dict): for idx in element_dict[element]: type_label, type_value = type_dict[idx] if type_label == RESIDUE_UNIQUE_BOND_LABEL: type_value = '' label += element + type_value if chiral_sym != CHIRAL_A: label += CHIRAL_SEPARATOR + chiral_sym labels.append(label) self.label_1, self.label_2 = labels
[docs] def setChiralHash(self, st): """ Set the chiral hash. :type st: structure.Structure :param st: the parent model """ # only the first parent bond needs to be considered, # if the CG particle types for this bond are equivalent # then sort the hash to avoid uniqueifying bonds with # different atom orderings chiral_hash = [] pair = [] if self.parent_bonds: pair = self.parent_bonds[0][:-1] if self.name_1 == self.name_2: pair = sorted(pair) for parent_idx in pair: parent_atom = st.atom[parent_idx] for prop, label in CHIRAL_PROP_DICT.items(): value = parent_atom.property.get(prop, None) if value: chiral_hash.append(label) break else: chiral_hash.append(CHIRAL_A) self.chiral_hash = CHIRAL_SYM_DICT.get(tuple(chiral_hash))
[docs] def setParentBondsHash(self, st): """ Set the parent bonds hash. :type st: structure.Structure :param st: the parent model """ # categorize using UNIQUE_BOND_DICT, hashes are just some # hash of data from the original repeat unit, if the CG particle # types for this bond are equivalent then sort the hash # to avoid uniqueifying bonds with different atom orderings parent_bonds_hash_info = [] for parent_bond in self.parent_bonds: parent_bond_hash = [] pair = parent_bond[:-1] if self.name_1 == self.name_2: pair = sorted(pair) for parent_idx in pair: parent_atom = st.atom[parent_idx] for label, key in UNIQUE_BOND_DICT.items(): if key == SMARTS_UNIQUE_BOND_KEY: idxs = smartsutils.get_group_atom_indices(parent_atom) value = smartsutils.ATOM_PROP_DELIM.join(map(str, idxs)) else: value = str(parent_atom.property.get(key, '')).strip() if value: parent_bond_hash.append((parent_idx, (label, value))) break else: return parent_bonds_hash_info.append(tuple(parent_bond_hash)) self.parent_bonds_hash_info = tuple(parent_bonds_hash_info) bond_hashes = [] for pair in self.parent_bonds_hash_info: bond_hash = '_'.join(['-'.join(atuple[1]) for atuple in pair]) bond_hashes.append(bond_hash) self.parent_bonds_hash = ':'.join(bond_hashes)
[docs] @staticmethod def formatParticleLabel(label): """ Format the particle label. :type label: str :param label: the label to format :rtype: str :return: the formated label """ return ''.join( [CoarseGrainBond.FORMAT_START, label, CoarseGrainBond.FORMAT_END])
[docs]class CGMapperMixin(object): """ A common mixin class for mapping all-atoms molecules to coarse-grain """ def _filterExistingDefinitions(self): """ Even atom property access gets expensive over huge atomistic systems for CG simulations, so this prefilters the (name, number) tuples to get rid of any tuples where the name property does not appear on any atom. The remaining (name, number) tuples will need to be searched for on every atom. :rtype: list :return: Each item of the list is a (name, number) tuple from DEFINITION_KEYS. Only those tuples with name properties that actually exist as atom properties on the structure will be included. """ existing_definitions = [] if not self.astructure.atom: return existing_definitions for name_key, num_key in DEFINITION_KEYS: keep = False if name_key == RESIDUE_NAME_KEY: # Built-in atom properties like RESIDUE_NAME_KEY are treated # differently and can't be accessed via mmct_atom_property_get # functions. But we know this property will be there. keep = True else: try: mm.mmct_atom_property_get_string(self.astructure, name_key, 1) except mm.MmException as mmexc: if mmexc.rc == mm.MMCT_ATOM_PROPERTY_NOT_DEFINED_IN_CT: # Property does not exist on any atom keep = False elif mmexc.rc == mm.MMCT_ATOM_PROPERTY_UNDEFINED_ATOM: # Property exists on at least one atom, but not this one keep = True else: # An unexpected error occurred, don't be silent about it raise else: # The property exists on the atom we checked keep = True if keep: existing_definitions.append((name_key, num_key)) return existing_definitions
[docs] def getAtomPolymerNames(self, atom, name): """ Get the polymer names for this atom. :type atom: structure._StructureAtom :param atom: the atom for which the names are needed :type name: str :param name: polymer monomer name :rtype: list :return: polymer names """ # add all polymer backbone atoms to both the backbone and backbone # and adjoining names and then delete the latter name later if there # were never any adjoining atoms added names = [] backbone_atom = atom.property.get(msprops.BACKBONE_ATOM_PROP, None) if backbone_atom: names.append(name + POLYMER_BACKBONE_TAG) names.append(name + POLYMER_BACKBONE_AND_ADJOINING_TAG) adjoining_atom = \ atom.property.get(msprops.BACKBONE_ADJOINING_ATOM_PROP, None) if adjoining_atom: if atom.element == 'H': names.append(name + POLYMER_BACKBONE_TAG) names.append(name + POLYMER_BACKBONE_AND_ADJOINING_TAG) side_group_name = \ atom.property.get(msprops.SIDE_GROUP_ATOM_PROP, None) if side_group_name: names.append(name + POLYMER_SIDEGROUP_TAG + side_group_name) return names
[docs] def getAtomResidueNames(self, atom, name): """ Get the residue names for this atom. :type atom: structure._StructureAtom :param atom: the atom for which the names are needed :type name: str :param name: residue name :rtype: list :return: residue names """ names = [] pdb_name = atom.property.get(PDB_ATOM_NAME_KEY, None) if pdb_name is None: return names pdb_name = pdb_name.strip() if not pdb_name: return names if pdb_name in BACKBONE_PDB_ATOM_NAMES: names.append(name + PROTEIN_BACKBONE_TAG) else: names.append(name + PROTEIN_SIDECHAIN_TAG) return names
[docs] def getAtomDefinitions(self, atom, existing_definitions): """ Get the definitions for this atom. :type atom: structure._StructureAtom :param atom: the atom for which definition data is needed :type existing_definitions: list :param existing_definitions: Each item of the list is a (name, number) tuple from DEFINITION_KEYS that should be searched on the atom. :rtype: list :return: list of ATOM_DEFINITION for the passed atom """ idxs_names_numberstrs = [] chain = str(atom.chain).strip() if not chain: chain = '_' molnum = str(atom.molecule_number) for idx, (name_key, number_key) in enumerate(existing_definitions): # see MATSCI-4079 if name_key in [ smartsutils.GROUP_NAME_PROP, smartsutils.GROUP_NAMES_PROP ]: tmpchain = '_' names = smartsutils.get_group_names(atom) numbers = smartsutils.get_group_numbers(atom) if not names or not numbers: continue else: tmpchain = chain try: names = [atom.property[name_key]] numbers = [atom.property[number_key]] except KeyError: continue for name, number in zip(names, numbers): name = name.strip() numberstr = '-'.join([molnum, tmpchain, str(number)]) if not name: name = GENERAL_TAG + numberstr idxs_names_numberstrs.append( ATOM_DEFINITION(name_key, idx, name, numberstr)) if name_key == msprops.MONOMER_NAME_PROP: for aname in self.getAtomPolymerNames(atom, name): idxs_names_numberstrs.append( ATOM_DEFINITION(name_key, idx, aname, numberstr)) elif name_key == RESIDUE_NAME_KEY and name not in RESIDUES_NO_SIDECHAIN: for aname in self.getAtomResidueNames(atom, name): idxs_names_numberstrs.append( ATOM_DEFINITION(name_key, idx, aname, numberstr)) return idxs_names_numberstrs
[docs] def trimDefinitionDict(self, definition_dict): """ Trim the given definition dictionary to get rid of redundant definitions. :type definition_dict: OrderedDict :param definition_dict: keys are particle type strings, values are tuples of tuples of integers of particle occurrences """ # if a polymer backbone and adjoining definition contains only backbone # atoms or if a polymer backbone definition contains only monomer atoms # then delete the former definitions as in these cases they are equivalent # to the latter definitions, the following tags is a list of pair tuples # where the first element of each tuple should be checked for redundancy with # the second element keys_to_rm = [] tags = [(POLYMER_BACKBONE_AND_ADJOINING_TAG, POLYMER_BACKBONE_TAG), (POLYMER_BACKBONE_TAG, '')] for key in list(definition_dict): for tag1, tag2 in tags: if key.endswith(tag1): name = key.split(tag1)[0] if definition_dict[key] == definition_dict[name + tag2]: keys_to_rm.append(key) for key in keys_to_rm: definition_dict.pop(key)
[docs] @staticmethod def getUniqueDefNames(atom_definitions): """ Gets the dictionary of names where the a single atom definition name is being used by multiple properties :param atom_definitions: List of all atom definitions :type atom_definitions: list(AtomDefinition) :returns: The dictionaries have keys as the definition names and the values of first dict is the property used by the definition and second dictionary values are list of properties ignored by the definition :rtype: dict, dict """ prop_name_keys = defaultdict(set) for atm_def in atom_definitions: prop_name_keys[atm_def.name].add(atm_def.prop_name) used_props, ignored_props = {}, {} for name, prop_names in prop_name_keys.items(): # Check if any property names need to be removed if len(prop_names) == 1: used_props[name] = list(prop_names)[0] ignored_props[name] = [] continue # Always prefer SMARTS properties prop_names = sorted(list(prop_names)) for smart_prop in [ smartsutils.GROUP_NAME_PROP, smartsutils.GROUP_NAMES_PROP ]: prop_names.sort(key=smart_prop.__eq__) # Remove the a key to use and save other keys as ignored keys used_props[name] = prop_names.pop(-1) ignored_props[name] = prop_names return used_props, ignored_props
[docs] def getDefinitionDict(self, warn_logger=None): """ Get definition dict. :type warn_logger: function :param warn_logger: If passed the warnings generated during generation of definitions will be passed to this function as a string :rtype: OrderedDict :return: keys are particle type strings, values are tuples of tuples of integers of particle occurrences """ existing_definitions = self._filterExistingDefinitions() # build a dictionary of available definitions atom-wise atom_definitions = {} for atom in self.astructure.atom: atom_definitions[atom.index] = self.getAtomDefinitions( atom, existing_definitions) # Find the keys that are duplicate used_prop_names, ignored_prop_names = self.getUniqueDefNames( msutils.flatten(atom_definitions.values())) if warn_logger: ignore_str = '' for name, values in ignored_prop_names.items(): if not values: continue used_prop = used_prop_names[name] ignore_str += ( f"Found multiple definitions for {name} using {used_prop} " f"and ignoring {','.join(values)}.\n") if ignore_str: warn_logger(ignore_str) # Create unsorted definition dictionary definition_dict = defaultdict( lambda: defaultdict(lambda: defaultdict(list))) for aid, adefinitions in atom_definitions.items(): for adef in adefinitions: if adef.prop_name in ignored_prop_names[adef.name]: continue definition_dict[adef.idx][adef.name][adef.numberstr].append(aid) # sort by definition index pairs = sorted(list(definition_dict.items()), key=lambda x: x[0]) if pairs: dicts = list(zip(*pairs))[1] else: dicts = () # for each definition index flatten, sort by definition name, # and sort by numberstr definition_dict = OrderedDict() for adict in dicts: pairs = [] for key, value in adict.items(): tmp_pairs = sorted(list(value.items()), key=lambda x: x[0]) tmp_values = list(zip(*tmp_pairs))[1] pairs.append((key, tmp_values)) pairs = sorted(pairs, key=lambda x: x[0]) definition_dict.update(pairs) self.trimDefinitionDict(definition_dict) return definition_dict
[docs] def areContiguous(self, indices): """ Return True if for the given indices the atoms are contiguously bonded, False otherwise. :type indices: tuple :param indices: the indices to check """ if len(indices) < 2: return True # This method using networkx is (nearly) infinitely faster than # extracting a structure of these atoms and checking the number of # molecules, mainly because structure extract time depends on the size # of the PARENT structure, not the extracted structure. graph = analyze.create_nx_graph(self.astructure, atoms=indices) if not graph: # No graph means no bonds between these atoms return False return networkx.is_connected(graph)
[docs] def getParticles(self, particles, aname, acolor, aatom_type, akey): """ Return list of particle objects. :type particles: list :param particles: contains particle indices as lists of atom indices :type aname: str :param aname: name for the particles :type acolor: tuple :param acolor: a triple of integers in [0, 255] that give an RGB color for the particles :type aatom_type: int :param aatom_type: atom type for the particles :type akey: str :param akey: a structure property key indicating the parent structure string :rtype: list :return: contains Particle instances """ # some data is shared but only the first particle gets the # parent structure and structure string set as this can be # a lot of information to have to store for the many particle # instances try: # Precompute the pbc that will be needed for all particles. This # saves a tiny amount of time, but also saves book-keeping of PBC # structure properties for the particle structures. pbc = clusterstruct.create_pbc(self.astructure) except ValueError: pbc = None parent_indices = particles[0] ref_particle = Particle() ref_particle.setParentIndices(parent_indices) ref_particle.setParentStructure(self.astructure) ref_particle.setParentStructureStr() ref_particle.setXYZ(astructure=self.astructure, pbc=pbc) ref_particle.setVDWRadius() ref_particle.setAtomicWeight() ref_particle.setFormalCharge() ref_particle.setPartialCharge() ref_particle.setParentCutBonds(self.astructure) ref_particle.setFormula() ref_particle.setName(aname) ref_particle.setRGBColor(acolor) ref_particle.setAtomType(aatom_type) ref_particle.setKey(akey) particle_objs = [ref_particle] for parent_indices in particles[1:]: aparticle = Particle() aparticle.setParentIndices(parent_indices) aparticle.setXYZ(astructure=self.astructure, pbc=pbc) aparticle.setVDWRadius(vdw_radius=ref_particle.vdw_radius) aparticle.setAtomicWeight(atomic_weight=ref_particle.atomic_weight) aparticle.setFormalCharge(formal_charge=ref_particle.formal_charge) aparticle.setPartialCharge( partial_charge=ref_particle.partial_charge) aparticle.setParentCutBonds(self.astructure) aparticle.setFormula(formula=ref_particle.formula) aparticle.setName(aname) aparticle.setRGBColor(acolor) aparticle.setAtomType(aatom_type) aparticle.setKey(akey) particle_objs.append(aparticle) return particle_objs
[docs] def inheritProperties(self, cg_structure): """ Inherit properties for the CG model from the parent structure. :type cg_structure: structure.Structure :param cg_structure: the CG model """ for prop in INHERITABLE_PROPERTIES: value = self.astructure.property.get(prop, None) if value is not None: cg_structure.property[prop] = value
[docs] def markParticleAtoms(self, particle_index, atom_indices): """ Mark the atoms in the structure by particle index. :type particle_index: int :param particle_index: the particle index :type atom_indices: list :param atom_indices: contains the atom indices for this particle """ for idx in atom_indices: smartsutils.append_property(self.astructure.atom[idx], CG_PARTICLE_INDICES, particle_index)
[docs] def buildCGFromParticles(self, all_particles, allow_unassigned=False): """ Build coarse grain structure from the particle :param list all_particles: list of lists of `Particles`, where each list corresponds to each mapping group :param bool allow_unassigned: If False particles not mapped will be ignored, if True they will be ignored leading to no CG bond creation. :rtype: structure.Structure :return: the new CG model """ # set up coarse grain model structure cg_structure = structure.create_new_structure() self.inheritProperties(cg_structure) coarsegrain.set_as_coarse_grain(cg_structure) # define coarse grain model atoms and mark atoms in the parent # structure to assist in defining the bonding in the coarse # grain model for particles in all_particles: sample_particle = particles[0] name = sample_particle.name prop_name = coarsegrain.CG_PARTICLE_KEY % name parent_structure_str = sample_particle.parent_structure_str cg_structure.property[prop_name] = parent_structure_str for particle in particles: atom = cg_structure.addAtom('C', *particle.xyz) particle.defineAtom(cg_structure, atom.index) self.markParticleAtoms(atom.index, particle.parent_indices) # define bonding in coarse grain model afunc = lambda x: get_cg_particle_indices(self.astructure.atom[x]) for particles in all_particles: for particle in particles: for a_idx_st, b_idx_st, order in particle.parent_cut_bonds: a_idxs_cg, b_idxs_cg = list(map(afunc, [a_idx_st, b_idx_st])) if not a_idxs_cg or not b_idxs_cg: # In case the atom was not assigned a CG map. Raise # error only when unassigned are not allowed. if allow_unassigned: continue else: raise ValueError( f'CG bond between atom {a_idx_st} and ' f'{b_idx_st} cannot be created as they are not ' 'mapped.') # for cases where atoms belong to multiple CG particles prevent # bonding a CG particle to itself a_idxs_cg = list(set(a_idxs_cg).difference(b_idxs_cg)) for a_idx_cg in a_idxs_cg: for b_idx_cg in b_idxs_cg: if not cg_structure.areBound(a_idx_cg, b_idx_cg): cg_structure.addBond(a_idx_cg, b_idx_cg, CG_BOND_ORDER) return cg_structure
[docs] def getAssignedAndUnassigned(self, all_particles): """ Get currently mapped (assigned) and upmapped (unassigned) atoms in the all-atom structure. :param list all_particles: list of lists of `Particles`, where each list corresponds to each mapping group :rtype: tuple(set, set) :return: tuple where first element is indexes of all-atom atoms that have been assigned mapping scheme and second are the atoms that have not been assigned any mapping scheme """ unassigned_atoms = set(range(1, self.astructure.atom_total + 1)) assigned_atoms = set() for particles in all_particles: assigned_atoms.update( msutils.flatten(particles, afunc=lambda x: getattr(x, 'parent_indices'))) unassigned_atoms.difference_update(assigned_atoms) return assigned_atoms, unassigned_atoms
[docs] def categorizeBonds(self): """ Return a dictionary of categorized bonds. :rtype: OrderedDict :return: sorted keys where keys are pair tuples of coarse grain particle names, values are OrderedDicts where keys are (SMILES, chiral_hash) tuples and values are lists of CoarseGrainBond """ all_categorized_cg_bonds = OrderedDict() obj = coarsegrain.Internals(self.cg_structure) obj.setBonds() for (cg_name_1, cg_name_2), data in obj.bonds.items(): categorized_cg_bonds = OrderedDict() for cg_idx_1, cg_idx_2 in data[0]: cg_bond = CoarseGrainBond(cg_name_1, cg_idx_1, cg_name_2, cg_idx_2) cg_bond.setParentData(self.cg_structure) cg_bond.setChiralHash(self.astructure) # The atomistic bond is examined for the bond directional properties # If atomistic bond has directional info, we pass it to the cg # bond, and skip the assigning bond direction stages. # If atomistic bond doesn't have directional info, we ask users # to define parent_bonds = cg_bond.parent_bonds # No bond direction if there is no parent bond and hence won't # be counted towards categorized_cg_bonds if not parent_bonds: continue atomic_bond = self.astructure.getBond(*parent_bonds[0][:-1]) for prop_key in [ coarsegrain.CG_UNIQUE_BOND_PARTICLE_1_LABEL_KEY, coarsegrain.CG_UNIQUE_BOND_PARTICLE_2_LABEL_KEY ]: value = atomic_bond.property.get(prop_key) if not value: # This cg_bond cannot find atomistic bond direction, # and is counted toward categorized_cg_bonds break self.cg_structure.getBond( cg_bond.idx_1, cg_bond.idx_2).property[prop_key] = value else: # cg_bond sets bond direction based atomistic bond property continue cg_bond.setParentBondsHash(self.astructure) if cg_bond.parent_bonds_hash and cg_bond.chiral_hash: atuple = (cg_bond.parent_bonds_hash, cg_bond.chiral_hash) categorized_cg_bonds.setdefault(atuple, []).append(cg_bond) # now dedup using SMILES which is necessary because when using # UNIQUE_BOND_DICT bonds may be categorized differently even though # they are in fact symmetrically equivalent, for example consider # (1) ala-arg-ala-arg, if entire residues are particles then there # are two types of ala-arg bonds if on the other hand the backbones # are particles then with UNIQUE_BOND_DICT there will still be two # types of ala(backbone)-arg(backbone) bonds even though they are # symmetrically equivalent, or (2) terpy ligand, with UNIQUE_BOND_DICT # there will be two types of phen-phen bonds even though they are # symmetrically equivalent, only the first occurrence, i.e. pairs[0], # needs to be considered f_categorized_cg_bonds = OrderedDict() for pairs in categorized_cg_bonds.values(): pairs[0].setSmiles(self.astructure) atuple = (pairs[0].smiles, pairs[0].chiral_hash) f_categorized_cg_bonds.setdefault(atuple, []).extend(pairs) # only keep pairs with two or more distinct bonds, for those # define the default particle labels if len(f_categorized_cg_bonds) >= 2: for pairs in f_categorized_cg_bonds.values(): for pair in pairs: pair.setDefaultParticleLabels(self.astructure) all_categorized_cg_bonds[(cg_name_1, cg_name_2)] = \ f_categorized_cg_bonds return all_categorized_cg_bonds
[docs]class CGMapper(CGMapperMixin): """ Class create a mapped coarse grain model from an all-atom structure. """ TYPE_MAX = 400
[docs] def __init__(self, struct): """ Initiate CGMapper class. :param structure.Structure struct: All-atom structure to be mapped to CG """ self.astructure = struct synced_astructure = xtal.sync_pbc(self.astructure) if synced_astructure: self.astructure = synced_astructure if msutils.is_coarse_grain(self.astructure): raise TypeError('The structure is already a coarse-grained model.')
[docs] def getMappedParticles(self): """ Create `Particles` for the assigned mapping scheme. :rtype: list(Particles) :return: list of lists of `Particles`, where each list corresponds to each mapping group """ particles = [] for count, (aname, aindexes) in enumerate(self.definition_dict.items()): akey = coarsegrain.CG_PARTICLE_KEY % aname aatom_type = self.TYPE_MAX - count aparticles = [] for indexes in aindexes: if not self.areContiguous(indexes): raise NonContiguousBondingError() else: aparticles.append(indexes) cur_particles = self.getParticles(aparticles, aname, MOLECULE_COLORS[count], aatom_type, akey) particles.append(cur_particles) return particles
[docs] def validateAllAssigned(self, all_particles): """ Check if all the atoms in the structure have been assigned mapping scheme. :param list all_particles: list of lists of `Particles`, where each list corresponds to each mapping group :raise: ValueError if all particle have not been assigned names """ _, unassigned_atoms = self.getAssignedAndUnassigned(all_particles) num_unassigned_atoms = len(unassigned_atoms) if num_unassigned_atoms != 0: raise ValueError(f'There are {num_unassigned_atoms} atoms ' 'unassigned.')
[docs] def markCGBonds(self): """ CG bonds that are mapped to same named atoms but resulted from different all-atom bonding environment will be marked as unique. Thus, resulting in individual parameters for each. """ key_1 = coarsegrain.CG_UNIQUE_BOND_PARTICLE_1_LABEL_KEY key_2 = coarsegrain.CG_UNIQUE_BOND_PARTICLE_2_LABEL_KEY aformat = CoarseGrainBond.formatParticleLabel categorized_bonds = self.categorizeBonds() for name_pair, adict in categorized_bonds.items(): for cg_bonds in adict.values(): for cg_bond in cg_bonds: abond = self.cg_structure.getBond(cg_bond.idx_1, cg_bond.idx_2) abond.property[key_1] = aformat(cg_bond.label_1) abond.property[key_2] = aformat(cg_bond.label_2)
[docs] def mapCGStructure(self, cg_names, allow_unassigned, unique_bonds): """ Map the passed cg_names to a coarse grain structure and build it from the all-atom structure. :param list: list of cg_names to be mapped :param bool allow_unassigned: if True, not all atoms are required to be mapped to the CG model. In case of False a check would be done and ValueError will be raised if all atoms were not assigned mapping :param bool unique_bonds: if True CG bonds that are mapped to same named atoms but resulted from different all-atom bonding environment will be marked as unique. If False the bonds will be considered indistinguishable """ definition_dict = self.getDefinitionDict() # Only take use cg_names in the definition self.definition_dict = { name: definition_dict[name] for name in cg_names } # Define Particles particles = self.getMappedParticles() if not allow_unassigned: self.validateAllAssigned(particles) # Build CG structure from particles cg_structure = self.buildCGFromParticles(particles, allow_unassigned) cg_structure.applyTubeStyle() title = self.astructure.title.strip() cg_structure.title = title + CG_TAG self.cg_structure = cg_structure # Categorized bonds if unique_bonds: self.markCGBonds()