Source code for schrodinger.protein.pdbname

# -*- coding: utf-8 -*-

__doc__ = """
Uses SMARTS matching to set PDB atom and residue names for a structure.
Also re-numbers residues, and optionally adds bond orders.

Copyright Schrodinger, LLC. All rights reserved.
"""
import json
import os

from rdkit import Chem

from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.structutils import assignbondorders
from schrodinger.structutils import build
from schrodinger.thirdparty import rdkit_adapter
from schrodinger.utils import fileutils

RESIDUE_PATTERNS_FILE = os.path.join(fileutils.get_mmshare_data_dir(), 'res',
                                     'residue_patterns.json')
with open(RESIDUE_PATTERNS_FILE) as f:
    RESIDUE_PATTERNS = json.load(f)

BLANK_PDBRES = ' ' * 4
PEPTIDIC_HEAD_SMARTS = r'NCC(=O)'
PEPTIDIC_HEAD_PDBNAMES = (' N  ', ' CA ', ' C  ', ' O  ')


[docs]def find_oxt_atom(atoms):
    """
    Given a list of atom objects, returns the OT atom (oxygen of the -COOH)
    bound to the C-termini atom of the group.  If no such atom is found,
    None is returned.

    TODO: Extend to include other termini atoms.
    """
    for atomobj in atoms:
        if atomobj.pdbname == ' C  ':
            # Find a match for Cα-C(=O)X.
            for bond in atomobj.bond:
                if bond.order == 1 and bond.atom2.element == 'O':
                    # The terminal oxygen found
                    return bond.atom2
    return None


[docs]def find_n1_atom(atoms):
    """
    Given a list of atoms, returns the N1 atom (N-terminus backbone nitrogen).
    If no such atom is found, None is returned.
    This function only works when the atom pdbname is properly named.

    :param atoms: List of atoms in which to find N1
    :type atoms: list[schrodinger.structure._StructureAtom]

    :return: The N1 atom
    :rtype: schrodinger.structure._StructureAtom
    """

    for atom in atoms:
        if atom.pdbname in (' N  ', ' N1 '):
            nh_count = 0
            backbone = 0
            nc_count = 0
            for b_atom in atom.bonded_atoms:
                if b_atom.pdbname == ' CA ':
                    backbone += 1
                    nc_count += 1
                if b_atom.pdbname == ' C  ':
                    nc_count += 1
            if backbone == 1 and nc_count < 2:
                return atom


def _rdk_atom_to_st_atom(st, rdk_atom):
    st_idx = int(rdk_atom.GetProp(rdkit_adapter.adapter.SCHRODINGER_INDEX))
    return st.atom[st_idx]


def _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom, atoms_to_assign,
                        labeled_atoms):
    query_mol = Chem.MolFromSmarts(smarts)
    matches = mol.GetSubstructMatches(query_mol)
    for rdk_match in matches:
        match = [rdk_idx_to_st_atom[rdk_idx] for rdk_idx in rdk_match]

        if not atoms_to_assign.issuperset(match):
            continue

        # if any atom in the match has already been labeled,
        # then we should not consider this a match
        if len(labeled_atoms.intersection(match)) != 0:
            continue

        yield match


def _find_backbone(mol, rdk_idx_to_st_atom, backbone_detection_size):
    """
    Helper function to assign_pdb_names which finds the RDKit indexes of
    the atoms along a backbone in the given mol.

    Note: this will not match any BALA residues, as these have abnormal
    peptidic head with 5 atoms instead of the usual 4.

    :param mol: The RDKit Mol on which to search for backbone atoms.

    :returns: a tuple of lists. The first list contains the indexes of
        the atoms which are part of a backbone. The second list contains
        the indexes of the terminal carbonyls in the backbone.
    """

    backbone_smarts = PEPTIDIC_HEAD_SMARTS * backbone_detection_size
    backbone_query = Chem.MolFromSmarts(backbone_smarts)

    potential_terminal_carbons = set()
    non_terminal_backbone = set()
    for match in mol.GetSubstructMatches(backbone_query):
        non_terminal_backbone |= set(match[:-2])
        non_terminal_backbone.add(match[-1])
        potential_terminal_carbons.add(match[-2])

    terminal_carbonyls = {
        rdk_idx_to_st_atom[rdk_idx] for rdk_idx in
        potential_terminal_carbons.difference(non_terminal_backbone)
    }
    all_backbone_atoms = {
        rdk_idx_to_st_atom[rdk_idx]
        for rdk_idx in non_terminal_backbone.union(potential_terminal_carbons)
    }

    return all_backbone_atoms, terminal_carbonyls


def _label_matched_residue(rename_residues, rename_atoms, st, pdbres,
                           atom_names, match, labeled_atoms):
    seen_names = set()
    for at, atom_name in zip(match, atom_names):
        if atom_name in seen_names:
            raise ValueError(
                f"Residue {at.resnum}:{pdbres} already has an atom name '{atom_name}'"
            )

        if rename_residues:
            at.pdbres = pdbres

        if rename_atoms:
            at.pdbname = atom_name
            if atom_name != BLANK_PDBRES:
                seen_names.add(atom_name)

        mm.mmhtreat_add_h_pdb_names(st.handle, at.index)

        labeled_atoms.add(at)


def _label_terminal_OXT(rename_residues, rename_atoms, c_at, backbone_atoms,
                        labeled_atoms):
    if c_at.pdbname != ' C  ':
        return

    for nbr in c_at.bonded_atoms:
        if nbr.atomic_number != 8 or nbr in backbone_atoms:
            continue

        if rename_residues:
            nbr.pdbres = c_at.pdbres
        if rename_atoms:
            nbr.pdbname = ' OXT'
        labeled_atoms.add(nbr)
        return


def _label_aa(rename_residues, rename_atoms, resolve_his, st, mol,
              rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms,
              terminal_carbonyls, labeled_atoms):
    for pdbres, smarts, atom_names in RESIDUE_PATTERNS['aa_patterns']:

        matches = list(
            _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom,
                                atoms_to_assign, labeled_atoms))
        for match in matches:

            # The match must have an aminoacid head in the backbone.
            # This may cause trouble with NCAA with more than one peptidic head
            if len(backbone_atoms.intersection(match)) != 4:
                continue

            if not resolve_his and pdbres in ('HID ', 'HIE ', 'HIP '):
                pdbres = 'HIS '

            # Check for disulfide-bridged-CYS residues.
            elif pdbres == 'CYT ':
                # We pop() the "other" S atom from this match so that the
                # other match does not get marked as labeled.
                other_CYS_s_atom = match.pop()

                # Check that the "other S" atom is matched in exactly one
                # other CYT residue. Else, clear this match, so that it
                # does not provide a "other" atom for any other CYT, and
                # we can label them as unknown residues later on.
                count = sum(1 for m in matches if other_CYS_s_atom in m)
                if count != 1:
                    continue

            _label_matched_residue(rename_residues, rename_atoms, st, pdbres,
                                   atom_names, match, labeled_atoms)

            # Check if this match includes a terminal carboxyl, and if so,
            # assign the OXT atom name
            matched_terminal_carbonyl = terminal_carbonyls.intersection(match)
            if len(matched_terminal_carbonyl) == 1:
                _label_terminal_OXT(rename_residues, rename_atoms,
                                    matched_terminal_carbonyl.pop(),
                                    backbone_atoms, labeled_atoms)


def _label_aa_caps(rename_residues, rename_atoms, resolve_his, st, mol,
                   rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms,
                   terminal_carbonyls, labeled_atoms):

    for pdbres, smarts, atom_names in RESIDUE_PATTERNS['aa_cap_patterns']:

        for match in _get_st_atoms_match(mol, smarts, rdk_idx_to_st_atom,
                                         atoms_to_assign, labeled_atoms):

            _label_matched_residue(rename_residues, rename_atoms, st, pdbres,
                                   atom_names, match, labeled_atoms)


[docs]def recurse_neighbors(at, residue_atoms, atoms_to_assign, labeled_atoms,
                      backbone_atoms):
    for nbr in at.bonded_atoms:
        if nbr in atoms_to_assign and nbr not in labeled_atoms and \
                nbr not in backbone_atoms and nbr not in residue_atoms:
            residue_atoms.add(nbr)
            recurse_neighbors(nbr, residue_atoms, atoms_to_assign,
                              labeled_atoms, backbone_atoms)


def _label_unknown_residues(rename_residues, rename_atoms, resolve_his, st, mol,
                            rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms,
                            terminal_carbonyls, labeled_atoms):

    for match in _get_st_atoms_match(mol, PEPTIDIC_HEAD_SMARTS,
                                     rdk_idx_to_st_atom, atoms_to_assign,
                                     labeled_atoms):

        if len(backbone_atoms.intersection(match)) != 4:
            continue

        # Check whether there is a peptoid starting from the N atom,
        # and then check for a sidechain coming out of the alpha carbon
        residue_atoms = set(match)
        for at in match[:2]:
            recurse_neighbors(at, residue_atoms, atoms_to_assign, labeled_atoms,
                              backbone_atoms)

        for at in residue_atoms:
            if rename_residues:
                at.pdbres = 'UNK '
            if rename_atoms:
                try:
                    idx = match.index(at)
                except ValueError:
                    at.pdbname = '    '
                else:
                    at.pdbname = PEPTIDIC_HEAD_PDBNAMES[idx]
            labeled_atoms.add(at)

        # Check if this match includes a terminal carboxyl, and if so,
        # assign the OXT atom name
        matched_terminal_carbonyl = terminal_carbonyls.intersection(match)
        if len(matched_terminal_carbonyl) == 1:
            _label_terminal_OXT(rename_residues, rename_atoms,
                                matched_terminal_carbonyl.pop(), backbone_atoms,
                                labeled_atoms)


[docs]def assign_pdb_names(st,
                     *,
                     selected_residues=None,
                     rename_residues=True,
                     rename_atoms=True,
                     resolve_his=False,
                     backbone_detection_size=3):
    """
    Assign PDB residue and/or atom names to struct.  This is based on
    substructure definitions for standard residues.

    Notes:

    - Only residues attached to a backbone chain will be assigned names.
    - An attempt will be made to identify side chains for amino acids and
      peptoids not in the patterns list. These will be labeled "UNK ".
    - PDB atom names for multiple Hydrogens attached to the same heavy atom
      will not follow the pro-R numbering rule, but instead H numbers will
      follow atom index order.

    :type st: schrodinger.Structure
    :param st: structure on which to work.

    :type selected_residues: list or None
    :param selected_residues: The list of Structure._Residue objects on
        which the assigning will take place. If None, all the structure
        will be used

    :type rename_residues: bool
    :param rename_residues: Whether to assign PDB residue names.

    :type rename_atoms: bool
    :param rename_atoms: Whether to assign PDB atom names.

    :type resolve_his: bool
    :param resolve_his: Whether HIS residues should be resolved into HID,
        HIE or HIP.

    :type backbone_detection_size: int
    :param backbone_detection_size: minimum numner of consecutive residues
        to be considered a backbone. By default, mono and dipeptides will
        be ignored.
    """

    if not rename_residues and not rename_atoms:
        return
    if backbone_detection_size < 0:
        return

    if selected_residues is None:
        atoms_to_assign = {at for at in st.atom}
    else:
        atoms_to_assign = {at for res in selected_residues for at in res.atom}

    mol = rdkit_adapter.to_rdkit(st,
                                 implicitH=True,
                                 include_properties=False,
                                 include_coordinates=False,
                                 sanitize=False,
                                 include_stereo=False)

    Chem.SanitizeMol(mol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES)

    rdk_idx_to_st_atom = {
        rdk_atom.GetIdx(): _rdk_atom_to_st_atom(st, rdk_atom)
        for rdk_atom in mol.GetAtoms()
    }

    backbone_atoms, terminal_carbonyls = _find_backbone(
        mol, rdk_idx_to_st_atom, backbone_detection_size)

    labeled_atoms = set()

    # Only attempt to match AAs if we found a backbone
    if backbone_atoms:
        _label_aa(rename_residues, rename_atoms, resolve_his, st, mol,
                  rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms,
                  terminal_carbonyls, labeled_atoms)

        # Once we have matched all known amino acids, then label any unknown
        # backbone residues
        _label_unknown_residues(rename_residues, rename_atoms, resolve_his, st,
                                mol, rdk_idx_to_st_atom, atoms_to_assign,
                                backbone_atoms, terminal_carbonyls,
                                labeled_atoms)

        # Once everything in the backbone has been labeled, find caps
        _label_aa_caps(rename_residues, rename_atoms, resolve_his, st, mol,
                       rdk_idx_to_st_atom, atoms_to_assign, backbone_atoms,
                       terminal_carbonyls, labeled_atoms)


def _ca_neighbor_present(atom):
    """
    Return True if the given atom is bound to a CA atom.
    """
    for bonded in atom.bonded_atoms:
        if bonded.pdbname == ' CA ':
            return True
    return False


def _find_inter_res_CN_bonds(chain):
    """
    Iterate over C-N bonds between each pair of neighboring residues (those
    bonded to each other).
    """
    for atom1 in chain.atom:
        if atom1.pdbname == ' N  ':
            for bond in atom1.bond:
                atom2 = bond.atom2
                if atom2.pdbname == ' C  ' and bond.order == 1:
                    if _ca_neighbor_present(atom1) and _ca_neighbor_present(
                            atom2):
                        yield atom1, atom2


def _get_atoms_in_nc_order(chain):
    """
    Group the atoms in the given chain by residue, in the N->C order.
    """

    # If multiple residues have the same name/type and same number,
    # our Structure.residue iterator will treat them as a single residue;
    # so we need first "break" them apart.
    break_bonds = [
        (atom1, atom2)
        for atom1, atom2 in _find_inter_res_CN_bonds(chain)
        if atom1.resnum == atom2.resnum and atom1.inscode == atom2.inscode
    ]

    if break_bonds:
        st = chain.structure.copy()
        chain = st.chain[chain.name]

        for atom1, atom2 in break_bonds:
            st.deleteBond(atom1.index, atom2.index)

        # Give each residue a unique number, as it's possible that multiple
        # residues share the same number in the input, which will cause our
        # libraries to treat them as the same residue.
        for i, res in enumerate(st.residue, start=1):
            res.resnum = i
            res.inscode = ' '

        # Now re-add the bonds, so that connectivity order can be detected:
        for atom1, atom2 in break_bonds:
            st.addBond(atom1.index, atom2.index, 1)

    # Now iterate over residues in the N->C connectivity order.
    # NOTE: This API will work only if each input residue already as a unique
    # residue number
    return [
        res.getAtomIndices()
        for res in structure.get_residues_by_connectivity(chain)
    ]


[docs]def renumber_residues(struct, chains):
    """
    Renumber all residues to have a unique residue number and have residue
    numbers that are contiguous.

    :type struct: list[schrodinger.structure.Structure]
    :param struct: The structure to operate on.  This structure is modified
        in-place.

    :type chains: list or 'all'
    :param chains: A list of chains to renumber, or the string 'all' if all
        chains should be fixed.
    """

    for chain in struct.chain:
        if chains == 'all' or chain.name in chains:
            ordered_res_atoms = _get_atoms_in_nc_order(chain)

            # Now modify the original structure according to the new order:
            for resnum, res_atoms in enumerate(ordered_res_atoms, start=1):
                for aindex in res_atoms:
                    atom = struct.atom[aindex]
                    atom.resnum = resnum
                    atom.inscode = ' '


[docs]def process_structure(struct, renumber_chains=None, assign_bond_orders=False):
    if assign_bond_orders:
        assignbondorders.assign_st(struct)

    build.add_hydrogens(struct)

    assign_pdb_names(struct)

    if renumber_chains:
        renumber_residues(struct, renumber_chains)