Source code for schrodinger.protein.pdbname

# -*- coding: utf-8 -*-

__doc__ = """
Uses SMARTS matching to set PDB atom and residue names for a structure.
Also re-numbers residues, and optionally adds bond orders.

Copyright Schrodinger, LLC. All rights reserved.
"""
import json
import os

from rdkit import Chem

from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.structutils import assignbondorders
from schrodinger.structutils import build
from schrodinger.thirdparty import rdkit_adapter
from schrodinger.utils import fileutils

RESIDUE_PATTERNS_FILE = os.path.join(fileutils.get_mmshare_data_dir(), 'res',
                                     'residue_patterns.json')
with open(RESIDUE_PATTERNS_FILE) as f:
    RESIDUE_PATTERNS = json.load(f)

BLANK_PDBRES = ' ' * 4
PEPTIDIC_HEAD_SMARTS = r'NCC(=O)'
PEPTIDIC_HEAD_PDBNAMES = (' N  ', ' CA ', ' C  ', ' O  ')


[docs]def find_oxt_atom(atoms): """ Given a list of atom objects, returns the OT atom (oxygen of the -COOH) bound to the C-termini atom of the group. If no such atom is found, None is returned. TODO: Extend to include other termini atoms. """ for atomobj in atoms: if atomobj.pdbname == ' C ': # Find a match for Cα-C(=O)X. for bond in atomobj.bond: if bond.order == 1 and bond.atom2.element == 'O': # The terminal oxygen found return bond.atom2 return None
[docs]def find_n1_atom(atoms): """ Given a list of atoms, returns the N1 atom (N-terminus backbone nitrogen). If no such atom is found, None is returned. This function only works when the atom pdbname is properly named. :param atoms: List of atoms in which to find N1 :type atoms: list[schrodinger.structure._StructureAtom] :return: The N1 atom :rtype: schrodinger.structure._StructureAtom """ for atom in atoms: if atom.pdbname in (' N ', ' N1 '): nh_count = 0 backbone = 0 nc_count = 0 for b_atom in atom.bonded_atoms: if b_atom.pdbname == ' CA ': backbone += 1 nc_count += 1 if b_atom.pdbname == ' C ': nc_count += 1 if backbone == 1 and nc_count < 2: return atom
def _rdk_atom_to_st_atom(st, rdk_atom): st_idx = int(rdk_atom.GetProp(rdkit_adapter.adapter.SCHRODINGER_INDEX)) return st.atom[st_idx] def _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom, atoms_to_assign, labeled_atoms): query_mol = Chem.MolFromSmarts(smarts) matches = mol.GetSubstructMatches(query_mol) for rdk_match in matches: match = [rdk_idx_to_st_atom[rdk_idx] for rdk_idx in rdk_match] if not atoms_to_assign.issuperset(match): continue # if any atom in the match has already been labeled, # then we should not consider this a match if len(labeled_atoms.intersection(match)) != 0: continue yield match def _find_backbone(mol, rdk_idx_to_st_atom, backbone_detection_size): """ Helper function to assign_pdb_names which finds the RDKit indexes of the atoms along a backbone in the given mol. Note: this will not match any BALA residues, as these have abnormal peptidic head with 5 atoms instead of the usual 4. :param mol: The RDKit Mol on which to search for backbone atoms. :returns: a tuple of lists. The first list contains the indexes of the atoms which are part of a backbone. The second list contains the indexes of the terminal carbonyls in the backbone. """ backbone_smarts = PEPTIDIC_HEAD_SMARTS * backbone_detection_size backbone_query = Chem.MolFromSmarts(backbone_smarts) potential_terminal_carbons = set() non_terminal_backbone = set() for match in mol.GetSubstructMatches(backbone_query): non_terminal_backbone |= set(match[:-2]) non_terminal_backbone.add(match[-1]) potential_terminal_carbons.add(match[-2]) terminal_carbonyls = { rdk_idx_to_st_atom[rdk_idx] for rdk_idx in potential_terminal_carbons.difference(non_terminal_backbone) } all_backbone_atoms = { rdk_idx_to_st_atom[rdk_idx] for rdk_idx in non_terminal_backbone.union(potential_terminal_carbons) } return all_backbone_atoms, terminal_carbonyls def _label_matched_residue(rename_residues, rename_atoms, st, pdbres, atom_names, match, labeled_atoms): seen_names = set() for at, atom_name in zip(match, atom_names): if atom_name in seen_names: raise ValueError( f"Residue {at.resnum}:{pdbres} already has an atom name '{atom_name}'" ) if rename_residues: at.pdbres = pdbres if rename_atoms: at.pdbname = atom_name if atom_name != BLANK_PDBRES: seen_names.add(atom_name) mm.mmhtreat_add_h_pdb_names(st.handle, at.index) labeled_atoms.add(at) def _label_terminal_OXT(rename_residues, rename_atoms, c_at, backbone_atoms, labeled_atoms): if c_at.pdbname != ' C ': return for nbr in c_at.bonded_atoms: if nbr.atomic_number != 8 or nbr in backbone_atoms: continue if rename_residues: nbr.pdbres = c_at.pdbres if rename_atoms: nbr.pdbname = ' OXT' labeled_atoms.add(nbr) return def _label_aa(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms): for pdbres, smarts, atom_names in RESIDUE_PATTERNS['aa_patterns']: matches = list( _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom, atoms_to_assign, labeled_atoms)) for match in matches: # The match must have an aminoacid head in the backbone. # This may cause trouble with NCAA with more than one peptidic head if len(backbone_atoms.intersection(match)) != 4: continue if not resolve_his and pdbres in ('HID ', 'HIE ', 'HIP '): pdbres = 'HIS ' # Check for disulfide-bridged-CYS residues. elif pdbres == 'CYT ': # We pop() the "other" S atom from this match so that the # other match does not get marked as labeled. other_CYS_s_atom = match.pop() # Check that the "other S" atom is matched in exactly one # other CYT residue. Else, clear this match, so that it # does not provide a "other" atom for any other CYT, and # we can label them as unknown residues later on. count = sum(1 for m in matches if other_CYS_s_atom in m) if count != 1: continue _label_matched_residue(rename_residues, rename_atoms, st, pdbres, atom_names, match, labeled_atoms) # Check if this match includes a terminal carboxyl, and if so, # assign the OXT atom name matched_terminal_carbonyl = terminal_carbonyls.intersection(match) if len(matched_terminal_carbonyl) == 1: _label_terminal_OXT(rename_residues, rename_atoms, matched_terminal_carbonyl.pop(), backbone_atoms, labeled_atoms) def _label_aa_caps(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms): for pdbres, smarts, atom_names in RESIDUE_PATTERNS['aa_cap_patterns']: for match in _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom, atoms_to_assign, labeled_atoms): _label_matched_residue(rename_residues, rename_atoms, st, pdbres, atom_names, match, labeled_atoms)
[docs]def recurse_neighbors(at, residue_atoms, atoms_to_assign, labeled_atoms, backbone_atoms): for nbr in at.bonded_atoms: if nbr in atoms_to_assign and nbr not in labeled_atoms and \ nbr not in backbone_atoms and nbr not in residue_atoms: residue_atoms.add(nbr) recurse_neighbors(nbr, residue_atoms, atoms_to_assign, labeled_atoms, backbone_atoms)
def _label_unknown_residues(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms): for match in _get_st_atoms_match(mol, PEPTIDIC_HEAD_SMARTS, rdk_idx_to_st_atom, atoms_to_assign, labeled_atoms): if len(backbone_atoms.intersection(match)) != 4: continue # Check whether there is a peptoid starting from the N atom, # and then check for a sidechain coming out of the alpha carbon residue_atoms = set(match) for at in match[:2]: recurse_neighbors(at, residue_atoms, atoms_to_assign, labeled_atoms, backbone_atoms) for at in residue_atoms: if rename_residues: at.pdbres = 'UNK ' if rename_atoms: try: idx = match.index(at) except ValueError: at.pdbname = ' ' else: at.pdbname = PEPTIDIC_HEAD_PDBNAMES[idx] labeled_atoms.add(at) # Check if this match includes a terminal carboxyl, and if so, # assign the OXT atom name matched_terminal_carbonyl = terminal_carbonyls.intersection(match) if len(matched_terminal_carbonyl) == 1: _label_terminal_OXT(rename_residues, rename_atoms, matched_terminal_carbonyl.pop(), backbone_atoms, labeled_atoms)
[docs]def assign_pdb_names(st, *, selected_residues=None, rename_residues=True, rename_atoms=True, resolve_his=False, backbone_detection_size=3): """ Assign PDB residue and/or atom names to struct. This is based on substructure definitions for standard residues. Notes: - Only residues attached to a backbone chain will be assigned names. - An attempt will be made to identify side chains for amino acids and peptoids not in the patterns list. These will be labeled "UNK ". - PDB atom names for multiple Hydrogens attached to the same heavy atom will not follow the pro-R numbering rule, but instead H numbers will follow atom index order. :type st: schrodinger.Structure :param st: structure on which to work. :type selected_residues: list or None :param selected_residues: The list of Structure._Residue objects on which the assigning will take place. If None, all the structure will be used :type rename_residues: bool :param rename_residues: Whether to assign PDB residue names. :type rename_atoms: bool :param rename_atoms: Whether to assign PDB atom names. :type resolve_his: bool :param resolve_his: Whether HIS residues should be resolved into HID, HIE or HIP. :type backbone_detection_size: int :param backbone_detection_size: minimum numner of consecutive residues to be considered a backbone. By default, mono and dipeptides will be ignored. """ if not rename_residues and not rename_atoms: return if backbone_detection_size < 0: return if selected_residues is None: atoms_to_assign = {at for at in st.atom} else: atoms_to_assign = {at for res in selected_residues for at in res.atom} mol = rdkit_adapter.to_rdkit(st, implicitH=True, include_properties=False, include_coordinates=False, sanitize=False, include_stereo=False) Chem.SanitizeMol(mol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES) rdk_idx_to_st_atom = { rdk_atom.GetIdx(): _rdk_atom_to_st_atom(st, rdk_atom) for rdk_atom in mol.GetAtoms() } backbone_atoms, terminal_carbonyls = _find_backbone( mol, rdk_idx_to_st_atom, backbone_detection_size) labeled_atoms = set() # Only attempt to match AAs if we found a backbone if backbone_atoms: _label_aa(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms) # Once we have matched all known amino acids, then label any unknown # backbone residues _label_unknown_residues(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms) # Once everything in the backbone has been labeled, find caps _label_aa_caps(rename_residues, rename_atoms, resolve_his, st, mol, rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms, terminal_carbonyls, labeled_atoms)
def _ca_neighbor_present(atom): """ Return True if the given atom is bound to a CA atom. """ for bonded in atom.bonded_atoms: if bonded.pdbname == ' CA ': return True return False def _find_inter_res_CN_bonds(chain): """ Iterate over C-N bonds between each pair of neighboring residues (those bonded to each other). """ for atom1 in chain.atom: if atom1.pdbname == ' N ': for bond in atom1.bond: atom2 = bond.atom2 if atom2.pdbname == ' C ' and bond.order == 1: if _ca_neighbor_present(atom1) and _ca_neighbor_present( atom2): yield atom1, atom2 def _get_atoms_in_nc_order(chain): """ Group the atoms in the given chain by residue, in the N->C order. """ # If multiple residues have the same name/type and same number, # our Structure.residue iterator will treat them as a single residue; # so we need first "break" them apart. break_bonds = [ (atom1, atom2) for atom1, atom2 in _find_inter_res_CN_bonds(chain) if atom1.resnum == atom2.resnum and atom1.inscode == atom2.inscode ] if break_bonds: st = chain.structure.copy() chain = st.chain[chain.name] for atom1, atom2 in break_bonds: st.deleteBond(atom1.index, atom2.index) # Give each residue a unique number, as it's possible that multiple # residues share the same number in the input, which will cause our # libraries to treat them as the same residue. for i, res in enumerate(st.residue, start=1): res.resnum = i res.inscode = ' ' # Now re-add the bonds, so that connectivity order can be detected: for atom1, atom2 in break_bonds: st.addBond(atom1.index, atom2.index, 1) # Now iterate over residues in the N->C connectivity order. # NOTE: This API will work only if each input residue already as a unique # residue number return [ res.getAtomIndices() for res in structure.get_residues_by_connectivity(chain) ]
[docs]def renumber_residues(struct, chains): """ Renumber all residues to have a unique residue number and have residue numbers that are contiguous. :type struct: list[schrodinger.structure.Structure] :param struct: The structure to operate on. This structure is modified in-place. :type chains: list or 'all' :param chains: A list of chains to renumber, or the string 'all' if all chains should be fixed. """ for chain in struct.chain: if chains == 'all' or chain.name in chains: ordered_res_atoms = _get_atoms_in_nc_order(chain) # Now modify the original structure according to the new order: for resnum, res_atoms in enumerate(ordered_res_atoms, start=1): for aindex in res_atoms: atom = struct.atom[aindex] atom.resnum = resnum atom.inscode = ' '
[docs]def process_structure(struct, renumber_chains=None, assign_bond_orders=False): if assign_bond_orders: assignbondorders.assign_st(struct) build.add_hydrogens(struct) assign_pdb_names(struct) if renumber_chains: renumber_residues(struct, renumber_chains)