Source code for schrodinger.application.prepwizard2.prepare

"""
Shared functionality between PrepWizard GUI and command-line PrepWizard.

Copyright Schrodinger, LLC. All rights reserved.
"""
# Note: this file must not import any gui-dependent modules (QtGui, QtWidgets,
# maestro_ui) so the tasks can run without display dependency

import base64
import enum
import itertools
import json
import os
import sys
import zlib

import schrodinger
from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.models import parameters
from schrodinger.protein import biounit
from schrodinger.protein import findhets
from schrodinger.protein import getpdb
from schrodinger.protein import annotation
from schrodinger.structutils import analyze
from schrodinger.structutils import build
from schrodinger.structutils import measure
from schrodinger.structutils.interactions import hbond
from schrodinger.ui.qt.appframework2 import application
from schrodinger.utils import mmutil

from . import tasks as pwtasks

maestro = schrodinger.get_maestro()


[docs]class HetType(enum.Enum):
    METAL_OR_ION = enum.auto()
    DETECTED_LIGAND = enum.auto()
    NON_WATER_SOLVENT = enum.auto()
    OTHER = enum.auto()


# List of PDB residue names to NOT run Epik on:
EPIK_EXCLUSION_LIST = ['HEM', 'HEC']

STATE_PANELTY_PROP = 'r_epik_State_Penalty'
METAL_PENALTY_PROP = 'r_epik_Metal_State_Penalty'
ORIG_ANUM_PROP = 'i_ppw_anum'
HET_NUM_PROP = 'i_ppw_het'
HET_STATES_PROP = 's_ppw_het_states'

# Default weight of each hydrogen bond in the state score:
HBOND_ENERGY = -1.0  # kcal/mol/h-bond

# Penalize het states where neutral amide is interacting with a metal. This
# number must be large enough to overpower any favorable differences in Epik
# state penalty that the neutral (or positive) charged state may have.
METAL_AMIDE_CLASH_PENALTY = 20.0

# Consider NH...O as twice stronger than other H-bonds:
# https://doi.org/10.1021/jp103470e p.9534:
# "... Nevertheless, the hydrogen bond fit energies of all fit
# functions showed the general expected order4 of: ENH ··· O ≈
# -6 kJ/mol > ENH ··· N ≈ ECH ··· O ≈ -3 kJ/mol. ..."
HBOND_ENERGY_NHO = -2.0  # kcal/mol/h-bond

RANK_NH_O_HIGHER = mmutil.feature_flag_is_enabled(
    mmutil.RANK_NH_O_HIGHER_IN_PPW)

PDBNAME_ELEMENT_DICT = {}
for pdbname in [' OD1', ' OE1', ' OD2', ' OE2']:
    PDBNAME_ELEMENT_DICT[pdbname] = "O"
for pdbname in [' ND1', ' ND2', ' NE2']:
    PDBNAME_ELEMENT_DICT[pdbname] = "N"
for pdbname in [' CG ', ' CD ', ' CD2', ' CE1']:
    PDBNAME_ELEMENT_DICT[pdbname] = "C"


[docs]def retrieve_and_read_pdb(pdbid):
    """
    Retrieve the given PDB and return the Structure object.
    On error, raises RuntimeError.
    """
    try:
        # First attempt to retrieve from third-party database, then
        # attempt download:
        pdb_file = getpdb.get_pdb(pdbid)
        if not os.path.isfile(pdb_file):
            raise Exception("getpdb module did not write out the file %s" %
                            pdb_file)
    except Exception as err:
        msg = "Failed to retrieve or download PDB file for ID: %s" % pdbid
        raise RuntimeError(msg)

    # NOTE: For PDBReader, all_occ is already True by default. The
    # InfraReader that is used to read CIF files does not have this option.
    try:
        st = next(structure.StructureReader(pdb_file))
    except Exception as err:
        raise RuntimeError(str(err))
    return st


# Dictionary that defines possible ionization states for metals:
# Key is the atomic number.
METAL_AND_ION_CHARGES = {
    11: [1],  # NA Sodium  +1
    12: [2],  # MG Magnesium +2
    19: [1],  # K  Potassium +1
    20: [2],  # CA Calcium +2
    25: [2, 3, 4, 5, 6, 7],  # MN Manganese +2, +3, +4, +5, +6, +7
    26: [2, 3],  # FE Iron +2 +3
    27: [2, 3],  # CO Cobalt +2 +3
    28: [2, 3],  # NI Nickel +2 +3
    29: [1, 2, 3],  # CU Copper +1 +2 +3
    30: [2],  # ZN Zinc +2
    #   48:[2], # CD Cadmium +2 No MacroModel parameters
    #   80:[1, 2]  HG Mercury +1 +2  No MacroModel parameters
}

epik_exe = os.path.join(os.environ['SCHRODINGER'], 'epik')
impref_exe = os.path.join(os.environ['SCHRODINGER'], 'utilities', 'impref')
protassign_exe = os.path.join(os.environ['SCHRODINGER'], 'utilities',
                              'protassign')
if sys.platform == 'win32':
    epik_exe += ".exe"
    impref_exe += ".exe"
    protassign_exe += ".exe"


def _count_bonds(a):
    """
    Count number of bonds to the given StructureAtom (double bonds = 2 bonds),
    zero-order bonds are not counted.
    """
    total_orders = 0
    for bond in a.bond:
        order = bond.order
        total_orders += order
    return total_orders


[docs]def serialize_het_states(states_by_het, complex_st):
    """
    Set JSON string for all given states as a CT-level property.
    """
    # Keys are het nums (as str, as required by JSON module), values are
    # lists of hets. Het number is included so that restoring is possible
    # even if some hets were later removed by the user (i_ppw_het
    # property would remain unchanged).
    states_by_het = {
        str(hetnum): [s.serialize() for s in states
                     ] for hetnum, states in states_by_het.items()
    }

    json_str = json.dumps(states_by_het)
    compressed_str = zlib.compress(bytes(json_str, 'utf-8'))
    encoded_str = base64.b64encode(compressed_str).decode('utf-8')
    complex_st.property[HET_STATES_PROP] = encoded_str


[docs]def deserialize_het_states(complex_st):
    """
    Attempt to read the het states from the CT using the s_ppw_het_states
    property, if present.
    """
    encoded_str = complex_st.property.get(HET_STATES_PROP)
    if not encoded_str:
        # No states to load
        return []

    compressed_str = base64.b64decode(encoded_str)
    json_str = zlib.decompress(compressed_str)
    states_dict = json.loads(json_str)

    # In JSON, all keys are str, to cast to int:
    all_hets_and_states = {int(hetnum): v for hetnum, v in states_dict.items()}

    return all_hets_and_states


[docs]class HetState:
    """
    Class representing an ionization/tautomeric state of a het (e.g. ligand).
    """

[docs]    def __init__(self, hetnum):
        self.hetnum = hetnum  # Het number (value of i_ppw_het properties)
        self.charges = None  # items=(original anum, formal charge)
        self.orders = None  # items=(orig anum1, orig anum2, order)
        self.penalty = None  # Epik state penalty; 0.0 for metals
        self.metal_penalties = None  # keys=orig anum, values=metal penalty
        self.score = None  # Score of the het state, in context of protein
        self.info_str = None  # Information string explaining state details,
        # including num hbonds to receptor.

[docs]    @classmethod
    def initFromEpikOutput(cls, hetnum, state_st):
        """
        Return a HetState instance for the given Epik output structure.
        """
        self = cls(hetnum)
        self.charges, self.orders = self.extractStateChargesAndOrders(state_st)

        state_penalty, metal_penalties = get_state_penalties(state_st)

        self.penalty = state_penalty  # Will be 0.0 for metals
        self.metal_penalties = metal_penalties
        # NOTE: score and info_str will be calculated when state is applied
        # to the complex CT
        return self

[docs]    @classmethod
    def initFromSerializedState(cls, hetnum, serialized_state):
        """
        Return a HetState instance for the given serialized state.
        """
        self = cls(hetnum)
        (self.charges, self.orders, self.penalty, self.metal_penalties,
         self.score, self.info_str) = serialized_state
        # Convert atom number keys back to ints. This is needed because all
        # keys are loaded as strings when reading JSON:
        self.metal_penalties = {
            int(anum): penalty
            for anum, penalty in self.metal_penalties.items()
        }
        return self

[docs]    def serialize(self):
        # JSON requires keys to be strings:
        return (self.charges, self.orders, self.penalty, self.metal_penalties,
                self.score, self.info_str)

[docs]    def extractStateChargesAndOrders(self, state_st):
        hetnum = self.hetnum

        # Dict of charge information for each atom in the output state.
        # Keys are original/input atom indices:
        charge_data = []
        for atom in state_st.atom:
            # Iterate over all atoms in this Epik output CT:
            anum = atom.property.get(ORIG_ANUM_PROP)

            # Skip atoms that were not in the original het. This includes the
            # protein atoms covalently bound to the het, and hydrogens that
            # were added by Epik.
            if atom.property.get(HET_NUM_PROP) != hetnum:
                continue

            # If we got here then we are processing a heavy atom that is part
            # of the het - so it must have i_ppw_anum set, as Epik doesn't
            # touch heavy atoms.
            assert anum, "ERROR: No i_ppw_anum property for atom"
            charge_data.append((anum, atom.formal_charge))

        order_data = []
        for bond in state_st.bond:
            atom1 = bond.atom1
            atom2 = bond.atom2
            if atom1.property.get(HET_NUM_PROP) != hetnum or atom2.property.get(
                    HET_NUM_PROP) != hetnum:
                # Hydrogen or covalently bound protein atom:
                continue

            anum1 = atom1.property.get(ORIG_ANUM_PROP)
            anum2 = atom2.property.get(ORIG_ANUM_PROP)
            assert anum1 is not None and anum2 is not None

            order_data.append((anum1, anum2, bond.order))

        return charge_data, order_data

[docs]    def applyState(self, st):
        """
        Apply this het state to the specified complex structure: modify
        the bond orders and formal charges to apply the currently selected
        het state.

        Also calculates the score and info_str properties

        Raises RuntimeError if any of the het atoms are no longer in the
        structure, or are not numbered correctly.
        """

        # Delete hydrogen neighbors before adding them according to next state:
        het_atom_indices = self.findHetAtoms(st)
        hydrogens = hydrogen_neighbors(st, het_atom_indices)
        st.deleteAtoms(hydrogens)

        # Need to find het atoms again, as it's possible that removing
        # hydrogens has altered atom ordering:
        het_atom_indices = self.findHetAtoms(st)

        # Keys are original atom indices (prior to pre-processing), values are
        # atom objects, for hets that are still present in the structure.
        atoms_by_anum = {}
        for anum in het_atom_indices:
            atom = st.atom[anum]
            # Iterate over het heavy atoms in the Workspace CT:
            # This atom's atom number in input (unprepared) complex CT:
            anum = atom.property.get(ORIG_ANUM_PROP)
            assert anum is not None
            atoms_by_anum[anum] = atom

        expecting_anums = {anum for (anum, _) in self.charges}
        for anum1, anum2, _ in self.orders:
            expecting_anums.add(anum1)
            expecting_anums.add(anum2)
        missing_atoms = expecting_anums.difference(atoms_by_anum.keys())
        if missing_atoms:
            raise RuntimeError(
                'Could not apply het state (%i atoms missing or renumbered)' %
                len(missing_atoms))

        # Change the bond orders and charges as necessary:
        for anum, charge in self.charges:
            atom = atoms_by_anum[anum]
            atom.formal_charge = charge
            if charge > 0:
                charge_str = '+%i' % charge
            elif charge < 0:
                charge_str = '%i' % charge
            else:
                charge_str = ''
            atom.label_user_text = str(charge_str)
            atom.label_format = "%UT"
            atom.label_color = 13

        for anum1, anum2, order in self.orders:
            atom1 = atoms_by_anum[anum1]
            atom2 = atoms_by_anum[anum2]
            bond = st.getBond(atom1, atom2)
            bond.order = order

        num_atoms_pre_htreat = st.atom_total
        # Add hydrogens according to the new bond orders and formal charges:
        st.retype()
        build.add_hydrogens(st, atom_list=het_atom_indices)

        # Hide the new hydrogens (so that they are not flickered when selecting
        # state):
        for anum in range(num_atoms_pre_htreat + 1, st.atom_total):
            st.atom[anum].visible = False

        # Calculate the total score for this state. The score may have changed
        # since the states were generated by Epik due to user's changes - for
        # example removing an interacting metal or water molecule.
        self.score, self.info_str = calc_state_score(st, het_atom_indices,
                                                     self.penalty,
                                                     self.metal_penalties)

[docs]    def findHetAtoms(self, complex_st):
        """
        Return a list of het heavy atoms in the given complex structure.
        """
        asl = "(atom.%s %i)" % (HET_NUM_PROP, self.hetnum)
        return analyze.evaluate_asl(complex_st, asl)


class _CommonProblemsFixer:
    """
    Class for fixing common structure problems.
    """

    def _hasZobs(self, atom):
        """
        Return True if any of the bonds of this atom are zero-order.
        """
        for bond in atom.bond:
            if bond.order == 0:
                return True
        return False

    def _setFormalCharge(self, atom, new_charge):
        """
        Set the formal charge for the given atom. Logs the change in the
        self._corrections_list.
        """
        prev_formal_charge = atom.formal_charge
        if prev_formal_charge == new_charge:
            return
        atom.formal_charge = new_charge
        msg = "Changed atom %i (%s) charge from %i to %i" % (
            atom.index, atom.element, prev_formal_charge, new_charge)
        self._corrections_list.append(msg)

    def _deprotCarbAcid(self, carbon):
        """
        De-protonate the specified carboxylic acid.

        :type carbon: `structure._StructureAtom` object
        :param carbon: The carbon of the carboxylic acid.
        """
        # NOTE: This is no longer needed due to CONV-855/CONV-860 fix

        oxygen_double = None
        oxygen_single = None
        for bond in carbon.bond:
            atom2 = bond.atom2
            if atom2.element == "O":
                if bond.order == 2:
                    oxygen_double = atom2
                else:
                    oxygen_single = atom2
        if not oxygen_double or not oxygen_single:
            # If this ever happens, then this is not a carboxylic acid, or the
            # residue has missing atoms. Skip this group.
            return
        self._setFormalCharge(oxygen_single, -1)

        # Delete the hydrogen bound to the oxygen (if any):
        for other_atom in oxygen_single.bonded_atoms:
            if other_atom.element == "H":
                self._atoms_to_delete.append(other_atom.index)
                correction = (
                    "Deleted hydrogen %i" % other_atom.index +
                    " because it was interfering with a zero-order bond")
                self._corrections_list.append(correction)

    def _fixElement(self, atom):
        """
        If an atom's element (Whether it's an oxygen, nitrogen, or carbon) does
        not correspond to its PDB name, it gets reset. (Ev:69207)
        """
        expected_element = PDBNAME_ELEMENT_DICT.get(atom.pdbname)
        if expected_element:
            prev_element = atom.element
            if prev_element != expected_element:
                correction = "Changed atom %i (%s) element from %s to %s" % (
                    atom.index, atom.pdbname, prev_element, expected_element)
                atom.element = expected_element
                self._corrections_list.append(correction)

    def _processSulfur(self, atom):
        """
        Ev:105338 Fix formal charges on sulfurs
        """
        all_neighbors_are_carbons = True
        total_orders = 0
        for bond in atom.bond:
            order = bond.order
            total_orders += order
            if bond.atom2.element != "C":
                all_neighbors_are_carbons = False
        if total_orders == 3 and all_neighbors_are_carbons:
            prev_formal_charge = atom.formal_charge
            if prev_formal_charge != 1:
                self._setFormalCharge(atom, +1)

    def _processBackboneOxygen(self, atom):
        """
        PPREP-875 Remove extra bonds when residue oxygens are bound to covalent
        ligands
        """
        # R-O=R will become R-O-R
        if _count_bonds(atom) >= 3 and atom.formal_charge <= 0:
            # This oxygen is making one too many bonds. Remove one.
            # We ignore R-O(+)=R, which is valid.
            if len(atom.bond) != 2:
                return
                # For now we handle only the case where one bond is single
                # and one bond is double.
            # Find the double bond and make it single:
            for bond in atom.bond:
                if bond.order >= 2:
                    bond.order = 1
                    res = str(bond.atom1.getResidue())
                    correction = "Modified bond %i-%i (%s): Set order to 1" % \
                        (bond.atom1, bond.atom2, res)
                    self._corrections_list.append(correction)
                    break

    def run(self, st):
        """
        Fix common structure problems in the given Structure.

        :rtype: list of strings
        :return: A list of texts describing what changes were made.
        """
        self._atoms_to_delete = []
        self._corrections_list = []

        for atom in st.atom:
            self._fixElement(atom)

            if atom.element == "S":
                self._processSulfur(atom)

            # Ev:108985
            elif atom.element == "B":
                if _count_bonds(atom) == 4 and atom.formal_charge == 0:
                    self._setFormalCharge(atom, -1)

            # Ev:120900 Delete hydrogens with more than one non-ZOB bond (PDB error)
            elif atom.element == "H":
                num_bonds = 0
                for bond in atom.bond:
                    if bond.order > 0:
                        num_bonds += 1
                if num_bonds > 1:
                    self._atoms_to_delete.append(atom.index)
                    correction = "Deleted atom %i: This hydrogen had invalid number of bonds (%i)" % (
                        atom.index, num_bonds)
                    self._corrections_list.append(correction)

            elif atom.pdbname == " O  ":
                self._processBackboneOxygen(atom)

            elif atom.pdbname in (" OD1", " OD2"):
                # PPREP-1092 Find carboxylic acids with ZOBs
                # Use pdbnames instead of elements as an optimization (since
                # fewer atoms will have these names, while there are many Os).
                # NOTE: This is no longer needed due to CONV-855/CONV-860 fix
                if self._hasZobs(atom):
                    for bond in atom.bond:
                        if bond.order != 0 and bond.atom2.element == "C":
                            self._deprotCarbAcid(bond.atom2)

            # PPREP-943 Fix formal charges on some irons:
            # (FIXME perhaps make this correction more broad?)
            elif atom.element == "Fe":
                if atom.pdbres == "SRM " and atom.pdbname == "FE  ":
                    self._setFormalCharge(atom, +2)

        if self._atoms_to_delete:
            st.deleteAtoms(self._atoms_to_delete)

        st.retype()
        return self._corrections_list


[docs]def fix_common_structure_mistakes(st):
    """
    Fixes common problems in structures and returns a list of correction strings
    that are to be reported to the user.
    """

    return _CommonProblemsFixer().run(st)


[docs]def atomsasl(atoms):
    """ Generates an ASL expression for the specified atoms. """
    if len(atoms) == 0:
        raise Exception("ERROR: empty atom list passed to atomsasl()!")
    asl = "(atom.num %s)" % ', '.join(map(str, atoms))
    return asl


def _create_bonds_by_smarts(st,
                            bond_type,
                            smarts1,
                            smarts2,
                            dist,
                            verbose=False):
    """
    General function to create bonds between the first atom in two
    different smarts patterns if they are closer then a distance. The smarts
    matching is performed on a per residue basis.
    :param st: Structure to modify.
    :type st: Schrodinger.structure

    :param bond_type: Bond type to be used for reporting
    :type bond_type: String

    :param smarts1: The first smarts pattern to consider, by convention
                    this should be the protein if that applies
    :type bond_type: String

    :param smarts2: The second smarts pattern to consider, by convention
                    this should be the non-protein part if that applies
    :type bond_type: String

    :param dist: Atoms must be at least this close to consider for
       for glycosilation
    :type dist: float

    :param verbose: Whether to print formed bonds to stdout
    :type verbose: boolean

    :rparam: Pairs of atom ( in the output structure ) where
       bonds were added
    :rtype: list of schrodinger atom objects
    """
    output = []
    attach_list = [[], []]
    for res in st.residue:
        res_anums = res.getAtomIndices()
        res_ct = res.extractStructure()
        for smarts, attach in zip([smarts1, smarts2], attach_list):
            match = analyze.evaluate_smarts_canvas(res_ct, smarts)
            # Map residue atom ids back to main structure
            for a_match in match:
                anum_by_st = res_anums[a_match[0] - 1]
                attach.append(anum_by_st)
    smarts1_attach, smarts2_attach = attach_list
    if not smarts1_attach or not smarts2_attach:
        return output

    atoms_to_delete = []
    atoms_to_add_hyd = []
    used_atoms = set()
    atoms = smarts1_attach + smarts2_attach
    for iatom1, iatom2 in measure.get_close_atoms(st, dist, atoms=atoms):
        atom1 = st.atom[iatom1]
        atom2 = st.atom[iatom2]
        # Don't reuse atoms
        if atom1.index in used_atoms or atom2.index in used_atoms:
            continue
        # Identify close atoms that are not bound
        if st.areBound(atom1, atom2):
            continue
        # Check both ways
        for s1_atom, s2_atom in itertools.permutations([atom1, atom2]):
            if s1_atom.index not in smarts1_attach:
                continue
            if s2_atom.index not in smarts2_attach:
                continue
            st.addBond(s1_atom, s2_atom, 1)
            output.append((s1_atom, s2_atom))
            used_atoms.add(s1_atom.index)
            used_atoms.add(s2_atom.index)
            # Keep track of the hydrogens to delete on each side
            # and which atoms to add hydrogens to.  This will all be
            # done at the end to avoid messing up indexes
            for atom in s1_atom, s2_atom:
                for ba in atom.bonded_atoms:
                    if ba.element == "H":
                        atoms_to_delete.append(ba.index)
                        atoms_to_add_hyd.append(atom.index)
            if verbose:
                print('   Created a %s bond between %s and %s' %
                      (bond_type, "%s:%d%s(%s):%s" %
                       (s1_atom.chain, s1_atom.resnum, s1_atom.inscode,
                        s1_atom.pdbres, s1_atom.pdbname), "%s:%d%s(%s):%s" %
                       (s2_atom.chain, s2_atom.resnum, s2_atom.inscode,
                        s2_atom.pdbres, s2_atom.pdbname)))

    # Note that since we are using structure Atom objects for output,
    # the indexes autoamtically update as the underlying CT is changed
    if len(atoms_to_delete) > 0:
        rmap = st.deleteAtoms(atoms_to_delete, renumber_map=True)
        atoms_to_add_hyd = [rmap[i] for i in atoms_to_add_hyd]
    if len(atoms_to_add_hyd) > 0:
        build.add_hydrogens(st, atom_list=atoms_to_add_hyd)
    return output


[docs]def create_glycosylation_bonds(st, dist=1.8, verbose=True):
    """
    Create glycosylation bonds for N-linked or O-linked glycosilation events
    Identfies neutral O or N with implicit or explicit hydrogens and
    forms bonds to sugars ( ring with 5 aliphatic carbons and one oxygen )
    at locations adjacent to the oxygen.
    :param st: Structure to modify.
    :type st: Schrodinger.structure

    :param dist: Atoms must be at least this close to consider for
       for glycosilation
    :type dist: float

    :param verbose: Whether to print formed bonds to stdout
    :type verbose: boolean

    :rparam: Pairs of atom ( in the output structure ) where
       bonds were added
    :rtype: list of schrodinger atom objects
    """
    # ( note that only NH1 is checked for for ARG as NH2 is positively
    # charged.
    return _create_bonds_by_smarts(
        st,
        "glycosylation",
        "[N,O;+0;h,H1,H2]",
        "[C;r6;H2,H1,h][O;r6][C;r6][C;r6][C;r6][C;r6]",
        dist,
        verbose=verbose)


[docs]def create_palmitoylation_bonds(st, dist=1.8, verbose=True):
    """
    Create palmitoylation bonds.
    Identfies neutral cysteine S with implicit or explicit hydrogens and
    palmitoyl groups or palmitoyl groups with the OH of the acid replace by
    a hydrogen
    :param st: Structure to modify.
    :type st: Schrodinger.structure

    :param dist: Atoms must be at least this close to consider for
       for pamitoylation
    :type dist: float

    :param verbose: Whether to print formed bonds to stdout
    :type verbose: boolean

    :rparam: Pairs of atom ( in the output structure ) where
       bonds were added
    :rtype: list of schrodinger atom objects
    """
    return _create_bonds_by_smarts(st,
                                   "palmitoylation",
                                   "[S,O;+0;h,H1,H2]",
                                   "[C;+0;h,H1](=[O;+0])",
                                   dist,
                                   verbose=verbose)


[docs]def create_disulfide_bonds(st, dist=3.2, verbose=False):
    """
    Create bonds between proximal Sulfurs, deleting any hydrogens on them.
    If verbose is True, prints log info to the termnal.
    Returns a list of (atom1, atom2) for ever added bond.
    """

    cys_sulfurs = analyze.evaluate_asl(
        st,
        '((res.ptype "CYS ") OR (res.ptype "CYX ")) AND ((atom.ptype " SG "))')

    added_bonds = []  # Ev:92646
    atoms_to_delete = []
    for isulfur_index in cys_sulfurs:
        for jsulfur_index in cys_sulfurs:
            if jsulfur_index >= isulfur_index:
                continue

            isulfur = st.atom[isulfur_index]
            jsulfur = st.atom[jsulfur_index]

            # Ev:129699 Skip this pair if at least one is already bound to
            # something (perhaps from a previous iteration of this loop)
            pair_hydrogens = []
            skip = False
            for a in [isulfur, jsulfur]:
                num_heavy = 0
                for bond in a.bond:
                    if bond.order > 0:  # Ev:131161 Ignore zero-order bonds
                        n = bond.atom2
                        if n.element == 'H':
                            pair_hydrogens.append(n.index)
                        else:
                            num_heavy += 1
                if num_heavy > 1:
                    skip = True
            if skip:
                continue

            if st.measure(isulfur, jsulfur) <= dist:
                isulfur.addBond(jsulfur, 1)
                added_bonds.append((isulfur.index, jsulfur.index))
                isulfur.formal_charge = 0
                jsulfur.formal_charge = 0
                atoms_to_delete += pair_hydrogens

                ires = isulfur.getResidue()
                jres = jsulfur.getResidue()

                # Ev:88748 Change the resname of the combined residue to CYX:
                if ires.pdbres != "CYX " or jres.pdbres != "CYX ":
                    ires.pdbres = "CYX "
                    jres.pdbres = "CYX "
                    if verbose:
                        print(
                            '   Created a bond between %s and %s; changed residue names to "CYX "'
                            % (ires, jres))
                else:
                    if verbose:
                        print('   Created a bond between %s and %s' %
                              (ires, jres))

    # Must delete atoms at the end, in order to preserve atom numbers:
    st.deleteAtoms(atoms_to_delete)

    st.retype()

    return added_bonds


[docs]def convert_selenomethionines(st):
    """
    Convert MSE residues to METs.
    Returns a list of residue strings that were converted.
    """

    converted_residues = []
    for res in st.residue:
        if res.pdbres == 'MSE ':
            # Convert this Selenomethionine to a Methionine:
            res_str = str(res)
            converted_residues.append(res_str)
            for atom in res.atom:
                if atom.pdbname == 'SE  ':
                    atom.element = 'S'
                    atom.pdbname = ' SD '
                    atom.color = 'yellow'
            res.pdbres = 'MET'

    st.retype()

    return converted_residues


def _set_oxygen_orders(st, mainatom, double, single, charge):
    """
    Sets the orders of phosphate-oxygen or sulfate-oxygen bonds.

    mainatom - atom number of the phosphate or sulfur

    double - list of atom numbers of the oxygens to double-bond to

    single - list of atom numbers of oxygens to single-bond to, and set charge to <charge>.

    """

    st = st.copy()
    for anum in double:
        bond = st.getBond(mainatom, anum)
        bond.order = 2
        st.atom[anum].formal_charge = 0

    for anum in single:
        bond = st.getBond(mainatom, anum)
        bond.order = 1
        st.atom[anum].formal_charge = charge

    return st


def _generate_phosphate_states(st, atoms):
    """
    Returns a list of structures (expanded states)
    """
    # TODO move this code into the GenerateStatesTask class

    phosphate = atoms[0]
    double_oxygen = atoms[1]

    # Make a list of negative hydrogens and hydrogens with hydrogens:
    negative_oxygens = []
    hydrogen_oxygens = []

    # Iterate over all oxygens with single bond to Phosphate:
    for anum in atoms[2:]:
        if st.atom[anum].bond_total == 1:
            if st.atom[anum].formal_charge == -1:
                negative_oxygens.append(anum)
            else:
                hydrogen_oxygens.append(anum)

    # FIXME For now, do not deal with this Phosphate if it has any hydrogen
    # oxygens:
    if hydrogen_oxygens:
        return [st]

    if not negative_oxygens:
        # No states can be generated from this Phosphate
        return [st]

    # Alternate double bond between the oxygens:
    if len(negative_oxygens) == 1:
        st2 = _set_oxygen_orders(st, phosphate, [negative_oxygens[0]],
                                 [double_oxygen], -1)
        return [st, st2]
    elif len(negative_oxygens) == 2:
        st2 = _set_oxygen_orders(st, phosphate, [negative_oxygens[0]],
                                 [double_oxygen, negative_oxygens[1]], -1)
        st3 = _set_oxygen_orders(st, phosphate, [negative_oxygens[1]],
                                 [double_oxygen, negative_oxygens[0]], -1)
        return [st, st2, st3]
    else:
        print('ERROR: wrong number of negative oxygens:', len(negative_oxygens))
        return [st]


def _generate_sulfate_states(st, atoms):
    """
    Returns a list of structures (expanded states)
    """
    # TODO move this code into the GenerateStatesTask class

    sulfur = atoms[0]
    double_oxygen1 = atoms[1]
    double_oxygen2 = atoms[2]

    # Make a list of negative hydrogens and hydrogens with hydrogens:
    negative_oxygens = []
    hydrogen_oxygens = []

    # Iterate over all oxygens with single bond to Sulfur:
    for anum in atoms[3:]:
        if st.atom[anum].bond_total == 1:
            if st.atom[anum].formal_charge == -1:
                negative_oxygens.append(anum)
            else:
                hydrogen_oxygens.append(anum)

    # Alternate double bond between the oxygens:
    if len(negative_oxygens) == 1 and len(hydrogen_oxygens) == 0:
        # st1 is equivalent of:              [double_oxygen1, double_oxygen2], negative_oxygens[0]
        st2 = _set_oxygen_orders(st, sulfur,
                                 [double_oxygen1, negative_oxygens[0]],
                                 [double_oxygen2], -1)
        st3 = _set_oxygen_orders(st, sulfur,
                                 [double_oxygen2, negative_oxygens[0]],
                                 [double_oxygen1], -1)
        return [st, st2, st3]

    elif len(hydrogen_oxygens) == 1 and len(negative_oxygens) == 0:
        # st1 is equivalent of:              [double_oxygen1, double_oxygen2], hydrogen_oxygens[0]
        st2 = _set_oxygen_orders(st, sulfur,
                                 [double_oxygen1, hydrogen_oxygens[0]],
                                 [double_oxygen2], 0)
        st3 = _set_oxygen_orders(st, sulfur,
                                 [double_oxygen2, hydrogen_oxygens[0]],
                                 [double_oxygen1], 0)
        return [st, st2, st3]

    else:
        # No states can be generated from this Sulfate
        return [st]


[docs]def count_phosphates_and_sulfurs(st):

    # Find any phosphates:
    p_smarts = "P(=O)(O)(O)O"
    s_smarts = "S(=O)(=O)(O)O"

    p_matches = analyze.evaluate_smarts_canvas(st, p_smarts)
    s_matches = analyze.evaluate_smarts_canvas(st, s_smarts)

    main_atoms = set()
    for atoms in p_matches + s_matches:
        main_atoms.add(atoms[0])

    return len(main_atoms)


[docs]def extend_phosphate_states(st):
    """
    For specified structure, generates phosphate states, and returns list
    of output structures. Ev:78688

    NOTE: Output structure has no hydrogens.
    """
    # TODO move this code into the GenerateStatesTask class

    hydrogens = []
    for a in st.atom:
        if a.atomic_number == 1:
            hydrogens.append(a)
    st.deleteAtoms(hydrogens)

    # Find any phosphates:
    p_smarts = "P(=O)(O)(O)O"

    matches = analyze.evaluate_smarts_canvas(st, p_smarts)

    # Remove duplicates:
    matches_no_dups = []
    sorted_matches = []
    for match in matches:
        sorted_match = sorted(match)
        if sorted_match not in sorted_matches:
            sorted_matches.append(sorted_match)
            matches_no_dups.append(match)

    sts = [st]
    for atoms in matches_no_dups:
        all_extended_sts = []
        for st in sts:
            extended_sts = _generate_phosphate_states(st, atoms)
            all_extended_sts.extend(extended_sts)
        sts = all_extended_sts

    return sts


[docs]def extend_sulfate_states(st):
    """
    For specified structure, generates sulfate states, and returns list
    of output structures. Ev:82634
    """
    # TODO move this code into the GenerateStatesTask class

    hydrogens = []
    for a in st.atom:
        if a.atomic_number == 1:
            hydrogens.append(a)
    st.deleteAtoms(hydrogens)

    # Find any sulfates in the structure:
    s_smarts = "S(=O)(=O)(O)O"
    matches = analyze.evaluate_smarts_canvas(st, s_smarts)

    # Remove duplicates:
    matches_no_dups = []
    sorted_matches = []
    for match in matches:
        sorted_match = sorted(match)
        if sorted_match not in sorted_matches:
            sorted_matches.append(sorted_match)
            matches_no_dups.append(match)

    sts = [st]
    for i, atoms in enumerate(matches_no_dups):
        # For this sulfate, extend the states:
        all_extended_sts = []
        for j, st in enumerate(sts):
            extended_sts = _generate_sulfate_states(st, atoms)
            all_extended_sts.extend(extended_sts)
        sts = all_extended_sts

    # Add hydrogens, because hydrogens were added to some atoms:
    for st in sts:
        build.add_hydrogens(st)

    return sts


[docs]def get_chain_sequences(st, remove_tails=True):
    """
    Will read the PDB sequences from the sequence block, and will return a
    dictionary (keys: chain names; values: sequence strings).

    If remove_tails is True, will chop off tails that are not existent in
    the CT, but will leave in the missing loop sections.

    Will raise RuntimeError on an error, or mmerror on mmct failure.
    """

    st = st.copy()

    # Make a list of residues from the chain sequence that are in the CT:
    used_chain_indexes = {}  # key: chain; value: list of ints
    for chain in st.chain:
        chain_used_indexes = []
        for res in chain.residue:
            index = res.atom[1].property.get('i_pdb_seqres_index')
            if index is not None:
                chain_used_indexes.append(index)
        used_chain_indexes[chain.name] = chain_used_indexes

    # Get the PDB sequence strings for each chain:
    try:
        data_handle = mm.mmct_ct_m2io_get_unrequested_handle(st)
    except mm.MmException:
        data_handle = mm.mmct_ct_get_or_open_additional_data(st, True)

    chain_seq_dict = {}

    # Ev:124587 Do not attempt to read the sequence if it's not present:
    # will be 1 in sequence data exists:
    num_seqres_blocks = mm.m2io_get_number_blocks(data_handle, "m_PDB_SEQRES")
    if num_seqres_blocks == 0:
        raise RuntimeError("Input structure had no sequence block data")

    mm.m2io_goto_block(data_handle, "m_PDB_SEQRES", 1)

    num_rows = mm.m2io_get_index_dimension(data_handle)
    for i in range(1, num_rows + 1):
        chain_strings = mm.m2io_get_string_indexed(data_handle, i,
                                                   ["s_pdb_chain_id"])
        seqres_strings = mm.m2io_get_string_indexed(data_handle, i,
                                                    ["s_pdb_SEQRES"])
        chain = chain_strings[0]
        seqres = seqres_strings[0]
        chain_seq_dict[chain] = seqres

    chain_strings = list(chain_seq_dict)
    used_chains = list(used_chain_indexes)
    for chain in used_chains:
        if chain not in chain_strings:
            msg = 'WARNING: Chain "%s" did not have a sequence record' % chain
            print(msg)

    # Modify the sequence strings by chopping off tails:
    output_sequences = {}  # key: chain name; value: sequence string
    mm.mmseq_initialize(mm.error_handler)
    for chain, seqres in chain_seq_dict.items():
        if chain not in used_chains:
            # Ev:102107 Possibly the user has deleted this chain
            continue

        # Figure out what residues in this sequence are missing in the CT:
        used_indexes = used_chain_indexes[chain]
        # used_indexes is 1-indexed

        # list of tuples: index=res index; value: (one letter code, whether it
        # is used)
        sequence_list = []
        numres = len(seqres) // 4
        for i in range(numres):
            beg = i * 4
            end = i * 4 + 4
            resname = seqres[beg:end]
            onelett = mm.mmseq_get_one_letter_res_name(resname)
            # Ev:86135 i is zero-indexed; while the list is 1-indexed:
            used = bool(i + 1 in used_indexes)
            sequence_list.append((onelett, used))

        in_sequence = ""
        for onelett, used in sequence_list:
            in_sequence += onelett

        if remove_tails:
            # Chop off the leading tail:
            while sequence_list:
                onelett, used = sequence_list[0]
                if used:
                    break
                else:
                    sequence_list.pop(0)

            # Chop off the trailing tail:
            while sequence_list:
                onelett, used = sequence_list[-1]
                if used:
                    break
                else:
                    sequence_list.pop(-1)

        if not sequence_list:
            raise RuntimeError("All of sequence for chain %s was missing" %
                               chain)

        # Convert to a sequence string (excluding tails):
        out_sequence = ""
        for onelett, used in sequence_list:
            out_sequence += onelett
        output_sequences[chain] = out_sequence

    mm.mmseq_terminate()

    return output_sequences


[docs]def write_sequences_to_fasta(pdbid, sequences_dict, fastafile):

    pdbid = pdbid.upper()

    fh = open(fastafile, 'w')
    for chain, sequence in sequences_dict.items():
        numres = len(sequence)
        line1 = ">%s_%s, %i bases,\n" % (pdbid, chain, numres)
        line2 = sequence + '\n'

        fh.write(line1)
        fh.write(line2)

    fh.close()


[docs]def does_res_have_missing_side_chains(residue):
    """
    Given a _Residue object, returns True if the residue is missing
    side-chain atoms.  If at least one backbone atom is (also) missing,
    False is returned.

    Basically only residues for which Prime missing-side-chains job can
    be run will return True.
    """

    # Ev:96128 Use a new mechanism to determine which residues have missing
    # atoms. This method does NOT rely on the i_m_pdb_convert_problem
    # property, which may be out-of-date:

    # DPB names for the residue backbone atoms:
    backbone_names = {" C  ", " CA ", " N  ", " O  "}

    # Do not show DNA/RNA residues, because we can't fill missing side-chains
    # on them:
    pdbres = residue.pdbres
    if pdbres in [
            '  A ', '  C ', '  G ', '  U ', ' DA ', ' DC ', ' DG ', ' TD '
    ]:
        return False

    if not residue.hasMissingAtoms():
        return False

    heavy_elements = []
    heavy_atom_names = set()
    for atom in residue.atom:
        if atom.atomic_number > 1:
            heavy_elements.append(atom.element)
            heavy_atom_names.add(atom.pdbname)

    # Ev:116225 Ignore residues that have been modified to -NH2 caps:
    if heavy_elements == ["N"]:
        return False

    # Ev:126307 Ignore residues that have been modified to -NH-CH3 caps:
    if sorted(heavy_elements) == ["C", "N"]:
        return False

    # PPREP-794 Ignore residues that are missing any backbone atoms:
    # (Note that this makes the two tests above unnecessary)
    if not heavy_atom_names.issuperset(backbone_names):
        return False

    # If got here, the reisude has missing side-chain atoms:

    return True


[docs]def do_any_residues_have_missing_side_chains(st):
    """
    Returns True if at least one of the residue in the given structure
    has missing side-chain atoms (backbone atoms are ignored).
    """

    for res in st.residue:
        if does_res_have_missing_side_chains(res):
            return True
    return False


[docs]def fix_sulfur_charges(st):
    """
    Post process by fixing the charge on zero-order-bonded Sulfurs
    Gives -1 or -2 charge to Sulfurs as appropriate.
    Deletes a hydrogen from Sulfurs coordinating with metals
    (Ev:61622)
    """

    hydrogens_to_delete = []
    for sulfur in st.atom:
        if sulfur.element != 'S':
            continue

        bonded_to_iron = False
        # Bonds for non-metal neighbors to this Sulfur (excluding hydrogens)
        reg_bonds = 0
        zob_bonds = 0  # How many bonds to metals
        hydrogens = []  # Hydrogen atoms bonded to this Sulfur
        for bond in sulfur.bond:
            bo = bond.order
            n = bond.atom2
            if n.element == 'Fe':
                # Set charge of Iron in FeS clusters to +2
                n.formal_charge = +2
                bonded_to_iron = True
            if bo == 0:
                zob_bonds += 1
            elif n.element == 'H':
                hydrogens.append(n)
            else:  # regular bond to non-hydrogen atom
                reg_bonds += bo

        # Number of hydrogens depends on number of regular bonds + number of zobs:
        if reg_bonds + zob_bonds == 0:
            max_hydrogens = 2
        elif reg_bonds + zob_bonds == 1:
            max_hydrogens = 1
        elif reg_bonds + zob_bonds == 2:
            max_hydrogens = 0
        else:
            # This is an unexpected hybridization state. Do not delete any
            # hydrogens (PPREP-694). Unless this sulfur is bond to an iron,
            # in which case we will still delete all hydrogens from it, and will
            # adjust the formal charge (PPREP-944).
            if bonded_to_iron:
                max_hydrogens = 0
            else:
                # Unknown hybridization state and NOT bound to an iron; skip:
                continue

        # Delete all hydrogens that are above the number allowed:
        while len(hydrogens) > max_hydrogens:
            hydrogens_to_delete.append(hydrogens.pop(0))

        kept_hydrogens = len(hydrogens)

        # The charge depends on the number of regular bonds (zobs excluded):
        if kept_hydrogens + reg_bonds == 0:
            sulfur.formal_charge = -2
        elif kept_hydrogens + reg_bonds == 1:
            sulfur.formal_charge = -1
        elif kept_hydrogens + reg_bonds == 2:
            sulfur.formal_charge = 0

    if hydrogens_to_delete:
        st.deleteAtoms(hydrogens_to_delete)

    st.retype()  # Correct the MacroModel atom types.


[docs]def extract_het_from_complex(complex_st, het_atoms):
    """
    Extract the het defined by the given ASL from the complex structure, and
    include receptor atoms withing 2 bonds (for covalently bound ligands).

    :param complex_st: Receptor-ligand complex structure
    :type complex_st: structure.Structure

    :param het_atoms: List of atom indices for the het
    :type het_atoms: list(int)

    :return: Het structure, Het structure plus some receptor atoms (covalent
             ligands only), and het type.
    :rtype: (structure.Structure, structure.Structure, HetType)
    """
    # Get Structure for het atoms:
    het_asl = atomsasl(het_atoms)
    het_atoms = analyze.evaluate_asl(complex_st, het_asl)
    hetnum = complex_st.atom[het_atoms[0]].property[HET_NUM_PROP]
    het_only_st = complex_st.extract(het_atoms)

    # Get Structure for het atoms and nearby atoms bonded to it:
    near_het_asl = "(withinbonds 2 (%s))" % het_asl
    near_heavyatoms = analyze.evaluate_asl(complex_st, near_het_asl)
    near_heavyatoms_st = complex_st.extract(near_heavyatoms)
    near_heavyatoms_st.property['i_ppw_hetnum'] = hetnum  # For records

    # Add hydrogens to the near-het atoms, so that Epik does not
    # produce errors regarding their lewis structure:
    extra_atoms_asl = "not (%s) and (not atom.ele H)" % het_asl
    extra_atoms_list = analyze.evaluate_asl(near_heavyatoms_st, extra_atoms_asl)
    build.add_hydrogens(near_heavyatoms_st, atom_list=extra_atoms_list)

    if het_only_st.atom_total == 1 and het_only_st.atom[
            1].atomic_number in METAL_AND_ION_CHARGES:
        het_type = HetType.METAL_OR_ION
    elif analyze.evaluate_asl(het_only_st, 'ligand'):
        het_type = HetType.DETECTED_LIGAND
    elif analyze.evaluate_asl(het_only_st, 'solvent'):
        het_type = HetType.NON_WATER_SOLVENT
    else:
        het_type = HetType.OTHER

    return het_only_st, near_heavyatoms_st, het_type


[docs]def extract_hets_from_complex(complex_st, het_atoms_lists):
    """
    Detect hets in the given complex structure, and extract them into
    separate substructures.  For covalent ligands, receptor atoms within
    2 bonds are also retained, so that Epik can account for them.

    :param complex_st: Receptor-ligand complex structure
    :type complex_st: structure.Structure

    :param het_atoms_lists: List of atoms for all hets
    :type het_atoms_lists: list(list(int))

    :return: Yields tuples of: Het structure, Het structure plus some
             receptor atoms (covalent ligands only), and het type.
    :rtype: generator(structure.Structure, structure.Structure, HetType)
    """

    # Delete any zero-order-bonds (Ev:79718). Needed in order for Epik
    # to generate states correctly for zero-order-bonded ligands. This
    # deletion
    # is only done to the temporary copy of the complex structure.
    complex_st = complex_st.copy()
    for atom in complex_st.atom:
        num_bonds = len(atom.bond)
        # Reverse order so that bond deletion does not affect the loop:
        for i in range(num_bonds, 0, -1):
            bond = atom.bond[i]
            if bond.order == 0:
                complex_st.deleteBond(atom, bond.atom2)

    for het_atoms in het_atoms_lists:
        het_st, state_st, het_type = extract_het_from_complex(
            complex_st, het_atoms)
        yield het_st, state_st, het_type


[docs]def prepare_for_epik(complex_st, types_to_process):
    """
    Extract hets from the given complex structure, (except het types that user
    requested to skip), and separate retained hets into 2 lists: Those that
    Epik should be run on, those that Epik doesn't produce any states for.
    Atoms in the original structure will be marked with i_ppw_anum
    property, for matching Epik output to atoms in the input structure.

    :param complex_st: Receptor-ligand complex structure
    :type complex_st: structure.Structure

    :param types_to_process: List of het types should be processed
    :type types_to_process: [HetType]

    :return: List of structures to run Epik on, and list of structures that
             should be processed, but without Epik.
    :rtype: [structure.Structure], [structure.Structure]
    """
    # NOTE: Whether states are skipped here or not had no affect on
    # whether they are shown in the GUI's Substructures tab - it will
    # always show all hets, even if they have no states generated.

    # Set the hetero atom number property on the hetero atoms in the stucture,
    # so Epik results can be matched to the atoms in the complex structure.

    het_atoms_lists = findhets.find_hets(complex_st)
    if not het_atoms_lists:
        return [], []
    print("Processing %i het groups..." % len(het_atoms_lists))

    # Annotate the structure with properties, so that Epik results can
    # be mapped back to the input structure:
    for atom in complex_st.atom:
        atom.property[ORIG_ANUM_PROP] = atom.index
    for hetnum, het_atoms in enumerate(het_atoms_lists, start=1):
        # This is required by prepare_for_epik() function now:
        for atomnum in het_atoms:
            complex_st.atom[atomnum].property[HET_NUM_PROP] = hetnum

    sts_to_run_epik_on = []
    sts_to_not_run_epik_on = []

    for het_st, state_st, het_type in extract_hets_from_complex(
            complex_st, het_atoms_lists):
        # NOTE: For non-covalent ligands, het_st is same as state_st.

        # Hets that don't match user's criteria are skipped completely
        if not het_type in types_to_process:
            continue

        # Epik will be run on some groups, but is not run on groups that
        # have valence violations, metals, ions, or excluded groups.
        skip_epik = (het_type == HetType.METAL_OR_ION or any(
            a.pdbres.strip() in EPIK_EXCLUSION_LIST for a in state_st.atom))
        if skip_epik:
            sts_to_not_run_epik_on.append(state_st)
        else:
            sts_to_run_epik_on.append(state_st)

    return sts_to_run_epik_on, sts_to_not_run_epik_on


[docs]def filter_undesired_states(orig_st, state_sts):
    """
    Returns a subset of state structures, which excludes metal-binding
    states for hets that are not within 5A of a metal.

    :param orig_st: Original complex structure (receptor, ligands, metals)
    :type orig_st: `structure.Structure`

    :param state_st: Epik output states to filter.
    :type state_st: Iterable of `structure.Structure`

    :return: List of filtered structures.
    :rtype: List of `structure.Structure`
    """

    filtered_states = []

    # For each het, whether it's within 5A of a metal:
    is_near_metal_dict = {}

    for state_st in state_sts:
        # Determine whether this is a metal-only state that should be skipped:
        # NOTE: If this property is not present, then Epik failed to generate
        # states for this structure - which means the output state is
        # effectively NOT "Metal only", and should be kept.
        if state_st.property.get('b_epik_Metal_Only'):
            hetnum = state_st.property['i_ppw_hetnum']
            if hetnum not in is_near_metal_dict:
                asl = "(atom.%s %i) AND (within 5 metals)" % (HET_NUM_PROP,
                                                              hetnum)
                matches = analyze.evaluate_asl(orig_st, asl)
                is_near_metal_dict[hetnum] = bool(matches)

            if not is_near_metal_dict[hetnum]:
                continue
        filtered_states.append(state_st)
    return filtered_states


[docs]def find_ppw_atom(st, anum):
    """
    Find the atom in the given structure whose i_ppw_anum property is set to
    the given value. ValueError exception is raised if such atom is not found.
    """
    matches = analyze.evaluate_asl(st, "(atom.%s %i)" % (ORIG_ANUM_PROP, anum))
    if not matches:
        raise ValueError("No atom with i_ppw_anum=%s was found" % anum)
    return st.atom[matches[0]]


[docs]def apply_state(complex_st, state_st):
    """
    Modify the het in complex_st complex (protein/ligand) structure such that
    its ionization state matches the output that we got from Epik (state_st).

    :param complex_st: Original complex structure (receptor + het)
    :type complex_st: `structure.Structure`

    :param state_st: Epik output for the het group. May include some atoms
            from the receptor if the ligand is covalently-bound.
    :type state_st: `structure.Structure`

    :return: List of atom indices in complex_st that are part of the het.
    :rtype: (dict, list)
    """

    heavy_atoms = []

    for state_atom in state_st.atom:
        if state_atom.element == 'H':
            continue  # Skip hydrogens
        state_anum = state_atom.property.get(ORIG_ANUM_PROP)
        if state_anum is None:
            # Skip protein atoms for covalently bound ligands
            continue

        complex_atom = find_ppw_atom(complex_st, state_anum)

        ahet = state_atom.property.get(HET_NUM_PROP)
        heavy_atoms.append(complex_atom.index)

        # Iterate over all bonds that this atom makes, and adjust them to
        # match Epik output:
        for bond in state_atom.bond:
            state_n_atom = bond.atom2
            if state_n_atom.element == 'H':
                continue  # Skip hydrogens
            neighbor_anum = state_n_atom.property.get(ORIG_ANUM_PROP)
            if neighbor_anum is None:
                # Skip protein atoms for covalently bound ligands
                continue

            nhet = state_n_atom.property.get(HET_NUM_PROP)

            # Skip this bond if the neighbor is NOT from the same het (it's
            # part of the protein to which the ligand is covalently bound - we
            # include some of the protein atoms in the Epik input).
            if nhet != ahet:
                continue

            # Modify the original full receptor structure:
            # Ev:96171 Determine which 2 atoms in <complex_st> correspond to this bond:
            neighbor_atom = find_ppw_atom(complex_st, neighbor_anum)
            complex_st.getBond(complex_atom, neighbor_atom).order = bond.order

        # Skip the formal charge if the atom is NOT part of the het (it's
        # part of the receptor to which the ligand is covalently bound - we
        # include some of the receptor atoms in the Epik input).
        if ahet is not None:
            complex_atom.formal_charge = state_atom.formal_charge

    # Need to re-type after changing bond orders and formal charges:
    complex_st.retype()

    # Delete and re-add het hydrogens in the complex structure:
    hydrogens_to_delete = hydrogen_neighbors(complex_st, heavy_atoms)

    renum_map = complex_st.deleteAtoms(hydrogens_to_delete, renumber_map=True)
    heavy_atoms = [renum_map[anum] for anum in heavy_atoms]

    natoms = complex_st.atom_total
    build.add_hydrogens(complex_st, atom_list=heavy_atoms)

    het_atoms = heavy_atoms + list(range(natoms + 1, complex_st.atom_total + 1))

    return het_atoms


[docs]def apply_state_and_calc_score(complex_st, state_st):
    """
    Apply the state <state_st> to <complex_st>, and return the score for the
    state in the context of the protein complex.

    :param complex_st: Original complex structure (receptor + het)
    :type complex_st: `structure.Structure`

    :param state_st: Epik output for the het group. May include some atoms
            from the receptor if the ligand is covalently-bound.
    :type state_st: `structure.Structure`

    :return: Tuple of state score, Epik penalty, and information string.
            Epik penalty will be None for metal states.
    :rtype: (float, float/None, str)
    """
    # TODO Move this to be an instance of HetState in the future
    het_atoms = apply_state(complex_st, state_st)
    state_penalty, metal_penalties = get_state_penalties(state_st)
    state_score, info_str = calc_state_score(complex_st, het_atoms,
                                             state_penalty, metal_penalties)
    return state_score, state_penalty, info_str


[docs]def get_state_penalties(state_st):
    """
    Return the Epik state penalty for this Epik output structure, as well as
    metal penalties for each atom that has the r_epik_Metal_State_Penalty
    property set.
    """
    # Metal states don't go through Epik, so don't have a penalty.
    # Setting it to zero here will effectively prevent sorting of
    # states for metal hets.
    state_penalty = state_st.property.get(STATE_PANELTY_PROP, 0.0)

    metal_penalties = {}
    for state_atom in state_st.atom:
        state_anum = state_atom.property.get(ORIG_ANUM_PROP)
        if state_anum is None:
            # This as an added hydrogen
            continue
        metal_penalty = state_atom.property.get(METAL_PENALTY_PROP)
        if metal_penalty is not None:
            metal_penalties[state_anum] = metal_penalty

    return state_penalty, metal_penalties


[docs]def get_smallest_metal_penalty(complex_st, metal_penalties):
    """
    Return the lowest metal penalty of all het atoms, considering only atoms
    that are within 3A of a receptor metal.
    """

    all_metals = analyze.evaluate_asl(complex_st, 'metals')

    smallest_metal_penalty = None
    for anum, metal_penalty in metal_penalties.items():
        try:
            complex_atom = find_ppw_atom(complex_st, anum)
        except ValueError:
            # This hydrogen was added by Epik and is not in the input structure.
            continue

        # Check if any metal within 3A exists AND this atom has a lower
        # metal penalty than previously seen atoms close to metals.
        if smallest_metal_penalty is None or metal_penalty < \
                smallest_metal_penalty:
            for metal_anum in all_metals:
                if complex_st.measure(metal_anum, complex_atom) <= 3:  # 3A
                    smallest_metal_penalty = metal_penalty
                    break
    return smallest_metal_penalty


[docs]def calc_hbond_energy(hydrogen, acceptor):
    """
    Calculate H-bond energy. Overly simple, to address PLDB-3337 and similar.
    """

    if RANK_NH_O_HIGHER and hydrogen.element == 'H':
        donor = next(hydrogen.bonded_atoms)
        if donor.element == 'N' and acceptor.element == 'O':
            return HBOND_ENERGY_NHO
    return HBOND_ENERGY


[docs]def calc_state_score(complex_st, het_atoms, state_penalty, metal_penalties):
    """
    Calculate the score for the current het state (must already been applies
    to the complex structure).

    :het_atoms: List of atoms (complex indexed) that are part of the het.
    """

    smallest_metal_penalty = get_smallest_metal_penalty(complex_st,
                                                        metal_penalties)

    # Total score (higher is better):
    score = 0.0

    # Calculate the H-Bonds contribution:
    hbonds = hbond.get_hydrogen_bonds(complex_st, het_atoms)
    hbond_count = len(hbonds)

    for bond in hbonds:
        score -= calc_hbond_energy(*bond)

    if state_penalty is not None:
        info_str = "State penalty: %.2f kcal/mol" % state_penalty
        score -= state_penalty
    else:
        # No state penalty; this is a metal state. Score will be 0.0.
        info_str = "State penalty: NA"

    if smallest_metal_penalty is not None:
        # Reward all states with metal interactions by up to 10 points.
        # Originally it was planned to replace the state penalty with the
        # smallest metal state penalty, but that caused non-metal interacting
        # states with better state penalties to be selected.
        # Metal state penalties are small positive values - lower is better.
        score += (10.0 - smallest_metal_penalty)
        info_str = "Metal state penalty: %.2f kcal/mol" % smallest_metal_penalty

    # Penalize amides that have an extra hydrogen that can clash
    # with a metal. Currently only terminal amides are handled - in
    # the future, consider making this algorithm more generic.
    num_metal_clashes = get_num_clashes_with_metals(complex_st, het_atoms)
    score -= num_metal_clashes * METAL_AMIDE_CLASH_PENALTY

    info_str += "; H-Bond count: %i" % hbond_count

    total_charge = 0
    for anum in het_atoms:
        total_charge += complex_st.atom[anum].formal_charge

    if total_charge > 0:
        charge_str = 'Q: +%i' % total_charge
    else:
        charge_str = 'Q: %i' % total_charge

    # The info string shows the Epik state penalty and the number of
    # hydrogen bonds between this state and the receptor.
    info_str += "; %s" % charge_str

    return score, info_str


def _generate_metal_and_ion_states(st):
    """
    Ionize the metal or ion atom in the specified state structure (only one,
    if exists) and return a list of ionized sts.
    """

    out_states = []

    # Ionize the metal:
    for atom in st.atom:
        charges = METAL_AND_ION_CHARGES.get(atom.atomic_number)
        if charges:
            # The original state should be the first state (Ev:65897):
            out_states.append(st)
            for ch in charges:
                if ch != atom.formal_charge:
                    stc = st.copy()
                    stc.atom[int(atom)].formal_charge = ch
                    stc.retype()
                    out_states.append(stc)
    if not out_states:
        # No metal in <st>, return the original:
        out_states.append(st)
    return out_states


[docs]def generate_phosphate_and_sulfur_states(sts):
    """
    For each ligand state, expand states for the phosphate and sulfate groups.
    """

    expanded_sts = []
    for st in sts:
        # For each input state, generate metal and phosphate states
        # Ev:78688
        if count_phosphates_and_sulfurs(st) > 4:
            expanded_sts.append(st)
            continue

        for st2 in extend_phosphate_states(st):
            for st3 in extend_sulfate_states(st2):
                expanded_sts.append(st3)

    return expanded_sts


[docs]def generate_metal_and_ion_states(sts):
    # If Epik is not run, the following is done only to the original het:

    expanded_sts = []

    for st in sts:
        for st2 in _generate_metal_and_ion_states(st):
            expanded_sts.append(st2)

    return expanded_sts


def _find_waters(st):
    """
    Return a list of water atoms. Each item in the list is a tuple of 3 water
    atom indices. Waters that don't have two hydrogens are skipped.
    """

    water_asl = 'water and (atom.ele O)'
    water_atoms = analyze.evaluate_asl(st, water_asl)

    waters_list = []

    for ai in water_atoms:
        a = st.atom[ai]

        # Make a list of the atoms of this water:
        curr_water_atoms = [ai]
        for h in a.bonded_atoms:
            if h.atomic_number == 1:
                curr_water_atoms.append(h.index)

        if len(curr_water_atoms) != 3:
            print('    WARNING: skipping water since it has %i hydrogens' %
                  (len(curr_water_atoms) - 1))
            continue

        curr_water_atoms = tuple(curr_water_atoms)

        waters_list.append(curr_water_atoms)

    return waters_list


[docs]def get_bridging_waters(st, min_hbonds=3):
    """
    Return a list of all waters in the specified structure that make at least
    <min_hbonds> number of H-bonds (H-bonds to other waters excluded).
    The list contains both oxygen and hydrogen atoms (if present).
    """

    waters_list = _find_waters(st)
    all_water_atoms = set()
    for water_mol_atoms in waters_list:
        all_water_atoms.update(water_mol_atoms)

    def _get_num_nonwater_hbonds(water_mol_atoms):
        count = 0

        for (atom1, atom2) in hbond.get_hydrogen_bonds(st, water_mol_atoms):
            anum1 = int(atom1)
            anum2 = int(atom2)
            if anum1 in water_mol_atoms:
                other_atom = anum2
            else:
                other_atom = anum1
            if other_atom not in all_water_atoms:
                count += 1
        return count

    bridging_water_atoms = []
    for water_atoms in waters_list:
        num_hbonds = _get_num_nonwater_hbonds(water_atoms)
        if num_hbonds >= min_hbonds:
            bridging_water_atoms += water_atoms

    return bridging_water_atoms


[docs]def hydrogen_neighbors(st, atoms):
    """
    Returns the list of neighbor (bonded) atoms that are hydrogens.

    :param st: Structure where atoms are from.
    :type st: `structure.Structure`

    :param atoms: List of atom indices for the heavy atoms.
    :type atoms: list of ints

    :return: List of hydrogen atom indices
    :rtype: list of ints.
    """

    hydrogens = []
    for heavy_atom in atoms:
        for bonded_atom in st.atom[heavy_atom].bonded_atoms:
            if bonded_atom.index not in hydrogens and bonded_atom.atomic_number == 1:
                hydrogens.append(bonded_atom.index)
    return hydrogens


[docs]def get_num_clashes_with_metals(st, het_anums):
    """
    Returns the number of times there is a hydrogen that is clashing with
    a metal. For now, only hydrogens on aliphatic nitrogens that are zero-
    order bonded to metals are considered. Geometry is not considered, because
    amides are rotatable groups.

    :param st: Structure where atoms are from.
    :type st: structure.Structure

    :param het_anumst of atom indices for the heavy atoms.
    :type het_anumst of ints

    :return: List of nitrogen atoms.
    :rtype: list(structure._StructureAtom)
    """
    num_clashes = 0
    for anum in het_anums:
        atom = st.atom[anum]
        if atom.element != 'N':
            continue
        zob_present = False
        h_present = False
        r_groups = 0
        valence = 0
        for bond in atom.bond:
            if bond.order == 0:
                zob_present = True
            else:
                valence += bond.order
                if bond.atom2.element == 'H':
                    h_present = True
                else:
                    r_groups += 1
        if zob_present and h_present and r_groups == 1 and valence == 4:
            num_clashes += 1

    return num_clashes


[docs]def get_pdb_id(st):
    """
    Returns the PDB ID of the given structure. If the property does not exist,
    returns the string "unknown".
    """
    pdbid = st.property.get("s_pdb_PDB_ID")
    if not pdbid:
        return "unknown"
    return pdbid.lower()


[docs]def get_het_name(st, het_atoms, include_chain=False):
    """
    Return the het "name" to display for the user for the given het. Assumes
    all residues have the same chain name.

    Example outputs:
        "HEM 123a"
        "HEM A:123a"
        "GLY-VAL-PRO 110-111-112"
        "GLY-VAL-PRO A:110-111-112"

    :param st: Structure containing the het
    :type st: `structure.Structure`

    :param het_atoms: Atoms from this het.
    :type het_atoms: list of ints

    :param include_chain: Whether the chain name should be included.
    :type include_chain: bool
    """

    residues = []
    for ai in het_atoms:
        res = st.atom[ai].getResidue()
        if not res in residues:
            residues.append(res)

    name_str = '-'.join([res.pdbres.strip() for res in residues]) + ' '

    if include_chain:
        name_str += st.atom[het_atoms[0]].chain + ':'

    name_str += '-'.join(
        ['%i%s' % (res.resnum, res.inscode.strip()) for res in residues])
    return name_str


[docs]def add_prepared_props(st):
    """
    Adds "prepared" and "prepared with version" properties to the given
    structure.
    """
    st.property['b_ppw_prepared'] = True
    st.property['s_ppw_prepared_with_version'] = schrodinger.get_release_name()


[docs]def generate_bio_units(st):
    """
    If the given structure has Biounit data, generate conformations for
    each biounit. If multiple bio units are defined, multiple structures
    are returned - oner per bio unit.
    If no biounit data is present, original structure is returned (in list).
    """
    biounits = biounit.biounits_from_structure(st)
    if not biounits:
        return [st]
    sts = [biounit.apply_biounit(st, bu) for bu in biounits]
    return sts


[docs]def apply_het_states(orig_st, state_sts, logger):
    """
    For each het state, generate a complex structure with that state applied,
    and return a list of (state score, complex structure), one for each state.

    :param orig_st: Input protein complex structure.
    :type orig_st: structure.Structure

    :param state_sts: List of ligand states to apply.
    :type state_sts: List[structure.Structure]

    :param logger: Logger to report output to.
    :type logger: logging.Loggger

    :return: List of state score and complex structure tuples
    :rtype: List[Tuple[float, structure.Structure]]
    """

    # For each state for this het:
    hetname = get_het_name(state_sts[0],
                           state_sts[0].getAtomIndices(),
                           include_chain=True)
    logger.debug("Processing het %s: " % hetname)

    # TODO Convert this to a list of HetState objects in the future:
    applied_states = []

    # For each het/lig structure, modify the input complex structure so that
    # its het has the same ionization state:
    for state_st in state_sts:
        complex_st = orig_st.copy()

        # Apply the state and calculate the score:
        state_score, epik_penalty, info_str = apply_state_and_calc_score(
            complex_st, state_st)
        applied_states.append((state_score, complex_st))

    logger.debug('  number of states applied: %i' % len(applied_states))

    return applied_states


[docs]def delete_far_waters(st, angstroms=5):
    het_atom_numbers = get_het_atom_numbers(st)

    if het_atom_numbers:  # hets found
        hets_asl = atomsasl(het_atom_numbers)
        far_waters_asl = "water and not fillres within %s (%s)" % (angstroms,
                                                                   hets_asl)
    else:
        far_waters_asl = "water"

    far_water_atoms = analyze.evaluate_asl(st, far_waters_asl)

    st.deleteAtoms(far_water_atoms)


[docs]def delete_non_bridging_waters(st, delwater_hbond_cutoff):
    """
    Delete waters that do not make at least this number of hydrogen bonds
    to non-waters.
    """

    # Make a list of all water atoms:
    water_atoms = analyze.evaluate_asl(st, "water")

    # Make a list of bridging water atoms:
    bridging_water_atoms = get_bridging_waters(st, delwater_hbond_cutoff)

    # Make a list of water atom that need to be deleted:
    del_atoms = [a for a in water_atoms if a not in bridging_water_atoms]
    del_waters = [
        str(st.atom[a].getResidue())
        for a in del_atoms
        if st.atom[a].element == "O"
    ]

    if del_waters:
        st.deleteAtoms(del_atoms)

    return del_waters


[docs]def idealize_hydrogen_temp_factor(st):
    """
    Set temperature factor for hydrogens without temperature factors to
    that of the bonded heavy atom (if available). This is important
    for X-ray refinement to prevent R-factor collapse.

    :param st: Input Structure
    :type st: Schrodinger.structure
    :return: None, but alters hydrogen temperature factors in place.
    """
    for atom in st.atom:
        if atom.atomic_number != 1 or atom.temperature_factor:
            continue
        for heavy_atom in atom.bonded_atoms:
            temperature_factor = heavy_atom.temperature_factor
            if temperature_factor is not None:
                atom.temperature_factor = temperature_factor
            if atom.alt_xyz is None:
                continue
            if heavy_atom.alt_xyz is None:
                temperature_factor = heavy_atom.temperature_factor
            else:
                temperature_factor = heavy_atom.property.get(
                    'r_m_alt_pdb_tfactor')
            if temperature_factor is not None:
                atom.property['r_m_alt_pdb_tfactor'] = temperature_factor


[docs]def get_het_atom_numbers(st):
    """
    Get the list of het groups in a structure

    :param st: Input Structure
    :type st: schrodinger.structure.Structure
    :return: atom numbers in hetero atom fields. Each list is the atoms within
        a given hetero group
    :rtype: list of list of integers
    """
    het_atom_numbers = []  # List of atoms number for ALL hets
    for het_atoms in findhets.find_hets(st):
        for ai in het_atoms:
            het_atom_numbers.append(ai)
    return het_atom_numbers


[docs]class PrepWizardSettings(parameters.CompoundParam):
    jobname: str = 'prepwizard'
    force_field: str = '2005'
    preprocess: bool = True
    assign_bond_orders: bool = True
    use_ccd: bool = True
    add_hydrogens: bool = True
    readd_hydrogens: bool = False
    idealize_hydrogen_tf: bool = True
    add_terminal_oxygens: bool = False
    cap_termini: bool = False
    preprocess_delete_far_waters: bool = False
    preprocess_watdist: float = None
    delete_far_waters: bool = True
    watdist: float = 5.0
    treat_metals: bool = True
    treat_disulfides: bool = False
    treat_glycosylation: bool = False
    treat_palmitoylation: bool = False
    selenomethionines: bool = False
    antibody_cdr_scheme: annotation.AntibodyCDRScheme = annotation.DEFAULT_ANTIBODY_SCHEME
    renumber_ab_residues: bool = False
    fillsidechains: bool = False
    fillloops: bool = False
    custom_fasta_file: bool = None
    run_epik: bool = True
    use_epikx: bool = False
    max_states: int = 1
    run_protassign: bool = True
    samplewater: bool = False
    include_epik_states: bool = False
    xtal: bool = False
    pH: float = 7.0
    epik_pH: float = 7.0
    epik_pHt: float = 2.0
    use_propka: bool = True
    propka_pH: float = 7.0
    label_pkas: bool = False
    force_list: list = []
    minimize_adj_h: bool = False
    protassign_number_sequential_cycles: int = None
    protassign_max_cluster_size: int = None
    delwater_hbond_cutoff: int = 0
    delete_nonbridging_waters: bool = False
    run_impref: bool = True
    rmsd: float = 0.3
    fixheavy: bool = False
    reference_struct: structure.Structure = None
    use_PDB_pH: bool = False
    preserve_st_titles: bool = False


[docs]def update_task_from_settings(task, settings):
    """
    Update the given PPWorkflowTask with these given settings.
    """
    input = task.input
    input.do_preprocess = settings.preprocess
    input.do_hbond = settings.run_protassign
    input.do_cleanup = settings.run_impref or settings.delete_far_waters or settings.delete_nonbridging_waters
    # PPWorkflowTask has a do_cleanup parameters, while PrepWizardSettings
    # does not (the task combines # 3 operations into one).

    pwtasks.update_params(input.preprocess, settings)
    pwtasks.update_params(input.preprocess.generate_states_settings, settings)
    pwtasks.update_params(input.hbond, settings)
    pwtasks.update_params(input.cleanup, settings)


@application.require_application(create=True, use_qtcore_app=True)
def prepare_structure(st, options, logger=None):
    """
    Prepare the given protein structure using PrepWizard.

    :param st: Protein structure to prepare.
    :type st: structure.Structure

    :param options: PrepWizard options.
    :type options: PrepWizardSettings

    :param logger: Optional logger to use.
    :type logger: log.logging.Logger

    :return: Prepared structures. Depending on the number of hets in
        the protein, up to options.max_states number of structures will be produced;
        by default 1.
    :rtype: List[structure.Structure]
    """

    task = pwtasks.PPWorkflowTask()
    update_task_from_settings(task, options)
    task.specifyTaskDir(os.getcwd())
    task.name = options.jobname
    task.logger = logger

    task.input.struct = st

    # Run the backend method, without adding a job control layer. This also
    # causes all output to go to stdout instead of log files.
    task.backendMain()

    expanded_sts = task.output.structs

    if logger:
        logger.info('  Number of states generated: %i' % len(expanded_sts))

    return expanded_sts


[docs]def tag_st_het_num_prop(sts):
    """
    Set the hetero atom number property to the sequence number in `sts` on the
    structures
    :param sts: the structures to tag
    :type sts: List[structure.Structure]
    """
    for idx, st in enumerate(sts, start=1):
        st.property[HET_NUM_PROP] = idx