Source code for schrodinger.protein.findhets

"""
Module for finding 'het' groups in a protein structure. These are the groups
which would typically be represented by HETATOM records in PDB file and
appear as orange when a file is first imported into Maestro. However this
module determines whether a residue represents a 'het' group independently
of any information from the PDB file and thus may be used on strucures which
do not originate from PDB files.

This module defines a "het" as a group of atoms which are bound to each
other and are not in any standard residues. This includes ligands
(including covalently bound ones), ions, cofactors, etc.


Usage::

    hets = findhets.find_hets(st)
    for atoms in hets:
        <do>

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Matvey Adzhigirey

from schrodinger.structutils import analyze

MAX_BRANCH_ITERATIONS = 15
MAX_HET_SIZE = 200

# List of residue names that should not be treated as hets:
non_het_resnames = [ \
"ALA", "VAL", "PHE", "PRO", "MET", "ILE", "LEU", "ASP",
"GLU", "LYS", "ARG", "SER", "THR", "TYR", "HIS", "CYS",
"ASN", "GLN", "TRP", "GLY", "2AS", "3AH", "5HP", "ACL",
"AIB", "ALM", "ALO", "ALY", "ARM", "ASA", "ASB", "ASK",
"ASL", "ASQ", "AYA", "BHD", "BMT", "BNN", "BUC", "SPC",
"BUG", "C5C", "C6C", "CCS", "CEA", "CHG", "CLE", "CME",
"CSD", "CSO", "CSP", "CSS", "CSW", "CXM", "CY1", "CY3",
"CYG", "CYM", "CYQ", "DAH", "DAL", "DAR", "DAS", "DCY",
"DGL", "DGN", "DHA", "DHI", "DIL", "DIV", "DLE", "DLY",
"DNP", "DPN", "DPR", "DSN", "DSP", "DTH", "DTR", "DTY",
"DVA", "EFC", "FLA", "FME", "GGL", "GLZ", "GMA", "GSC",
"HAC", "HAR", "HIC", "HIP", "HMR", "HPQ", "HTR", "HYP",
"IIL", "IYR", "KCX", "LLP", "LLY", "LTR", "LYM", "LYZ",
"MAA", "MEN", "MHS", "MIS", "MLE", "MPQ", "MSA", "MSE",
"MVA", "NEM", "NEP", "NLE", "NLN", "NLP", "NMC", "OAS",
"OCS", "OMT", "PAQ", "PCA", "PEC", "PHI", "PHL", "PR3",
"PRR", "PTR", "SAC", "SAR", "SCH", "SCS", "SCY", "SEL",
"SEP", "SET", "SHC", "SHR", "SOC", "STY", "SVA", "TIH",
"TPL", "TPO", "TPQ", "TRG", "TRO", "TYB", "TYQ", "TYS",
"TYY", "AGM", "GL3", "SMC", "ASX", "CGU", "CSX", "GLX",
"ACE", "NME", "HID", "HIE", "HIP", "ASH", "GLH", "NMA",
"CYT", "SRO", "THO", "TYO", "CYX", "HOH", "DOD", "TIP",
"A", "C", "G", "T", "U", "+U", "DC", "DG", "DT", "DA", "DU", "GUA", "ADE", "CYT", "THY", # DNA bases
'A3T', 'A5T', 'C3T', 'C5T', 'G3T', 'G5T', 'T3T', 'T5T', 'U3T', 'U5T', # DNA cap bases Ev:60484
'LYN', 'ARN', 'ASH', 'GLH', # Ev:59563
"POT", "HXL", # DNA caps Ev60436
                   ]

# Dictionary that defines possible ionization states for metals:
# Key is the atomic number.
metal_charges = {
    11: [1],  # NA Sodium  +1
    12: [2],  # MG Magnesium +2
    19: [1],  # K  Potassium +1
    20: [2],  # CA Calcium +2
    #25:[2, 3, 4, 5, 6, 7], # MN Manganese +2, +3, +4, +5, +6, +7
    # ^- creates general atom type atoms
    26: [2, 3],  # FE Iron +2 +3
    27: [2, 3],  # CO Cobalt +2 +3
    28: [2, 3],  # NI Nickel +2 +3
    29: [1, 2, 3],  # CU Copper +1 +2 +3
    30: [2]  # ZN Zinc +2
    #   48:[2]      CD Cadmium +2 No MacroModel parameters
    #   80:[1, 2]   HG Mercury +1 +2  No MacroModel parameters
}


class _AddNeighborGroups:
    """
    For each untemplated atom that is bound to a templated
    het, create a list of atoms that it is bound to that are not templated.
    If this list (group) is too large, do not consider it. Otherwise merge
    those atoms with nearby templated hets.
    """

    def __init__(self, st, het_groups):
        # Make a list of all templated atoms:
        self.all_het_atoms = []
        for het_atoms in het_groups:
            self.all_het_atoms.extend(het_atoms)

        self.pseudo_het_groups = []
        # For each templated het:
        for het_atoms in het_groups:
            for ai in het_atoms:
                self.pseudo_het_atoms = []
                if self.nextBranch(st.atom[ai], None,
                                   1) and self.pseudo_het_atoms:
                    # Did not reach threshold and found atoms:
                    self.pseudo_het_groups.append(self.pseudo_het_atoms)

        # Group templated hets and the new pseudo hets by connectivity:
        het_and_neighbor_atoms = []
        for group in (het_groups + self.pseudo_het_groups):
            for ai in group:
                if ai not in het_and_neighbor_atoms:
                    het_and_neighbor_atoms.append(ai)

        self._newhets = analyze.group_by_connectivity(st,
                                                      het_and_neighbor_atoms)

    def nextBranch(self, a, backatom, i):
        """ Return False if reached thresholds """
        if i > MAX_BRANCH_ITERATIONS:
            return False  # Reached threshold
        for b in a.bond:
            if b.order == 0:
                continue  # Ev:69319
            n = b.atom2
            ni = int(n)
            if backatom and ni == int(backatom):
                continue  # So that we don't go back through branch
            if n.atomic_number == 1 or ni in self.all_het_atoms:
                continue
            if ni not in self.pseudo_het_atoms:
                self.pseudo_het_atoms.append(ni)
            if not self.nextBranch(n, a, i + 1):  # reached th resold
                return False
        return True  # Went through the branch w/o reaching threshold

    def groups(self):
        return self._newhets

    def __iter__(self):
        for mol in self._newhets:
            yield mol


def _is_metal(atom):
    """ Returns boolean: whether atom is a metal or not """
    return atom.atomic_number in metal_charges


[docs]def find_hets(st,
              include_metals=True,
              include_hydrogens=False,
              excluded_hets=None):
    """
    Finds all het groups in the st, and returns them as a list of atom lists.
    (each atom list is a list of atom indecies)

    This module defines a "het" as a group of atoms which are bound to each
    other and are not in any standard residues. This includes ligands
    (including covalently bound ones), ions, cofactors, etc.

    The list includes heavy atoms only by default

    include_metals:    whether to consider single metal atoms as het groups
    include_hydrogens: whether to include het's hydrogen atoms in the list
    excluded_hets:     a list of residue names to not consider het groups
    """

    all_het_atoms = []
    for a in st.atom:
        # According to specification PDB residue names should always contain 3 characters
        # PDB atom names, on the other hand, are 4 characters long
        pdbres = a.pdbres.strip()  # Strips space, if specified as 4 characters.

        # Add (or skip) single metal atoms:
        num_non_zob_bonds = 0
        for b in a.bond:
            if b.order != 0:  # Ev:69319
                num_non_zob_bonds += 1

        if _is_metal(a) and num_non_zob_bonds == 0:
            if include_metals:
                all_het_atoms.append(int(a))
            continue

        non_het = (pdbres in non_het_resnames)
        excluded_het = (excluded_hets and pdbres in excluded_hets)
        if not non_het and not excluded_het:
            # Skip hydrogens if include_hydrogens == False:
            if include_hydrogens or a.atomic_number != 1:
                all_het_atoms.append(int(a))

    # Group the untemplated atoms into molecules by connectivity:
    pre_groups = []

    for het_atoms in analyze.group_by_connectivity(st, all_het_atoms):
        # FIXME: MAX_HET_SIZE should be compared to number of HEAVY atoms
        if len(het_atoms) > MAX_HET_SIZE:  # Skip huge hets
            continue
        if len(het_atoms) == 1:  # Skip waters:
            if st.atom[het_atoms[0]].atomic_number == 8:
                continue
        pre_groups.append(het_atoms)

    hets = _AddNeighborGroups(st, pre_groups).groups()

    return hets


#EOF