Source code for schrodinger.protein.helm

"""
A module to parse convert biomolecules to and from HELM strings.

The Hierarchical Editing Language for Macromolecules(HELM) was designed to
create a single notation that can encode the structure of all biomolecules.

HELM encodes different types of biomolecules, including XNA, peptides, and Chem
polymers, but also allows users to defined custom polymer types.

The HELM specification can be found at
https://pistoiaalliance.atlassian.net/wiki/spaces/PUB/pages/13795362/HELM+Notation
"""
from rdkit import Chem

from collections import defaultdict
from dataclasses import dataclass
import re

MONOMER_INDEX_PROP_NAME = "_monomerIndex"
ATOM_LABEL_PROP_NAME = "atomLabel"
MOL_LABEL_PROP_NAME = "_name"
TOKEN_POSITION_PROP_NAME = "tokenPosition"
IS_BRANCH_MONOMER_PROP_NAME = "isBranchMonomer"
CONNECTION_ATTACHMENT_POINTS_PROP_NAME = "attachmentPoints"

SUPPORTED_POLYMERS = {"PEPTIDE", "DNA", "RNA"}

POLYMER_ID_PATTERN = re.compile(
    r"""^
        ([A-Z]+) # the polymer type
        [1-9]\d* # the polymer number >= 1
        $
        """, re.X)
SIMPLE_POLYMER_PATTERN = re.compile(
    r"""^
        ([^\}]+) # simple polymer token cannot contain '}' character
        (\"[^$]+\")? # inline annotation eg. "mutant"
        $
        """, re.X)
# SIMPLE_POLYMER_PATTERN(|SIMPLE_POLYMER_PATTERN)*
SIMPLE_POLYMERS_SECTION_PATTERN = re.compile(
    r"""^
         ([A-Z]+\d+)\{(.+)\}(\".+\")? # simple polymer pattern
         (|([A-Z]+\d+)\{(.+)\}(\".+\")?)* #repeated simple polymers separated by '|'
         $
         """, re.X)

# example RNA1,CHEM1,21:R2‐1:R1
CONNECTION_PATTERN = re.compile(
    r"""^
        (?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
        (?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
        (?P<from_monomer_position>[1-9]\d*) # eg. 21
        \:(?P<from_attachment_point>R[1-9]\d*)\- # attachment point eg. R2
        (?P<to_monomer_position>[1-9]\d*) # eg. 23
        \:(?P<to_attachment_point>R[1-9]\d*) # eg. R1
        $
        """, re.X)
# example RNA1,CHEM1,21:pair‐1:pair
HYDROGEN_BOND_PATTERN = re.compile(
    r"""^
       (?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
       (?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
       (?P<from_monomer_position>[1-9]\d*) # eg. 21
       \:(?P<from_attachment_point>pair)\-
       (?P<to_monomer_position>[1-9]\d*) # eg. 23
       \:(?P<to_attachment_point>pair)
       $
       """, re.X)


[docs]@dataclass
class Monomer:
    position_in_polymer: int
    position_in_token: int
    name: str
    is_branch: bool


[docs]def to_rdkit(helm: str) -> Chem.Mol:
    """
    Converts a helm string to a residue-level ROMol
    """
    helm_version = helm[helm.rfind("$") + 1:]

    if not helm_version:
        helm2 = convert_helm1_to_helm2(helm)
        return _cg_mol_from_helm2(helm2)
    elif helm_version.upper() == "V2.0":
        return _cg_mol_from_helm2(helm)

    raise ValueError("Only HELM and HELMV2.0 are supported.")


[docs]def to_helm(cg_mol: Chem.Mol) -> str:
    """
    Converts a coarse-grain residue-level ROMol to a helm string
    """
    return _cg_mol_to_helm2(cg_mol)


def _validate_polymer_id(polymer_id: str) -> None:
    """
    a valid polymer id should have a polymer type in caps followed by a
    number. Eg., PEPTIDE1

    :raises ValueError: If the polymer id doesn't match the HELM specification.

    :raises ValueError: If the polymer type is unsupported.
    """
    polymer_id_match = POLYMER_ID_PATTERN.match(polymer_id)

    if not polymer_id_match:
        raise ValueError(f"The polymer id '{polymer_id}' is not valid.")

    polymer_type = polymer_id_match.group(1)
    if polymer_type not in SUPPORTED_POLYMERS:
        raise ValueError(
            f"The polymer type '{polymer_type}' is not currently supported.")


def _validate_connection_token(connection_token):
    """
    Helper function to validate a connection token.

    A valid connection token should be of the form
        SourcePolymerID,TargetPolymerID,SourceMonomerPosition:SourceAttachmentPoint-TargetMonomerPosition:TargetAttachmentPoint

    :raises ValueError: If the connection token does not match the HELM spcification.
    """
    connection_match = CONNECTION_PATTERN.match(connection_token)
    if connection_match is not None:
        return

    hydrogen_bond_match = re.match(HYDROGEN_BOND_PATTERN, connection_token)
    if hydrogen_bond_match is not None:
        return

    raise ValueError(f"Invalid connection: {connection_token}.")


def _validate_simple_polymers_section(simple_polymers_section):
    """
    Helper function to validate the simple polymers HELM section.

    :raises ValueError: If the simple polymers section token doesn't match the
                        HELM specification.
    """
    if re.search(SIMPLE_POLYMERS_SECTION_PATTERN,
                 simple_polymers_section) is None:
        raise ValueError(f"Invalid simple polymers: {simple_polymers_section}.")


def _validate_simple_polymer(simple_polymer):
    """
    Helper function to validate a simple polymer token.

    :raises ValueError: If the simple polymer token doesn't match the HELM
                        specification.
    """
    if re.search(SIMPLE_POLYMER_PATTERN, simple_polymer) is None:
        raise ValueError(f"Invalid simple polymer: {simple_polymer}.")


[docs]def convert_helm1_to_helm2(helm1: str) -> str:
    """
    Helper function to convert helm1 to helm2

    The helm specifications changed from
        simple polymers$connections$H-bonds$annotations$
    to
        simple polymers$connections(including H-bonds)$polymer groups$annotations$V2.0
    """
    converted_helm2_sections = [""] * 4

    # since the first section is still legal, we just add it
    sections = helm1.split("}$")

    if len(sections) == 1:
        raise ValueError("Invalid simple polymers")

    converted_helm2_sections[0] = sections[0] + "}"

    remaining_helm1 = "}$".join(sections[1:])
    remaining_sections = remaining_helm1.split("$")

    # add information about connections
    connections = [remaining_sections[0]]
    # if join the connections section to the hydrogen bonds section
    if remaining_sections[1]:
        connections.append(remaining_sections[1])
    converted_helm2_sections[1] = "|".join(connections)

    # add the remaining section and the version info
    converted_helm2_sections[3] = remaining_sections[2]
    return "$".join(converted_helm2_sections) + "$V2.0"


def _get_multiple_character_monomer_token(simple_polymer_iterator):
    """
    Since a multi character token is surrounded by square brackets, we need to
    keep track of the depth of square brackets in the token, and return the token.

    :param simple_polymer_iterator: a string iterator on the current polymer


    Example:
        input: N[C@H](C)C(=O)O].A.P
        output: N[C@H](C)C(=O)O
    """
    depth = 1
    current_monomer = ""

    for char in simple_polymer_iterator:
        if char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            # if we find a valid token, e.g. [token], we need to yield it and
            # release ownership of the iterator
            if depth == 0:
                return current_monomer
        current_monomer += char


def _get_monomers(simple_polymer: str) -> Monomer:
    """
    Returns the monomer position within the polymer, monomer position within the
    containing token, the monomer name, and whether the monomer is a branch monomer

    Monomer tokens are defined as dot-separated values within the curly braces
    of a simple polymer. A valid monomer tokens can be either a single
    capitalized letter of the alphabet, or multiple characters surrounded
    by square brackets. A multiple character monomer token may be a SMILES
    string or a branched monomer token , i.e. multiple monomers in one monomer
    token with one monomer surrounded by brackets.

    Currently not supporting ambiguous monomers.

    Example:
        input: A.A(dF)P
        output: Monomer(1, 1, A, False), Monomer(2, 1, A, False), Monomer(3, 2, dF, True), Monomer(4, 3, P, False)
    """
    current_monomer = ""
    simple_polymer = iter(simple_polymer)

    # monomer/ token positions are synonymous to residue numbers and are 1-based
    monomer_position = 1
    token_position = 1
    is_branch_monomer = False

    for char in simple_polymer:
        if char == ".":
            if current_monomer:
                yield Monomer(monomer_position, token_position, current_monomer,
                              is_branch_monomer)
                monomer_position += 1
            current_monomer = ""
            token_position += 1
        elif char == "[":
            if current_monomer:
                yield Monomer(monomer_position, token_position, current_monomer,
                              is_branch_monomer)
                monomer_position += 1
            current_monomer = ""
            multiple_character_monomer_token = _get_multiple_character_monomer_token(
                simple_polymer)
            if multiple_character_monomer_token is not None:
                yield Monomer(monomer_position, token_position,
                              multiple_character_monomer_token,
                              is_branch_monomer)
                monomer_position += 1
        elif char == "(":
            if current_monomer:
                yield Monomer(monomer_position, token_position, current_monomer,
                              is_branch_monomer)
                monomer_position += 1
            current_monomer = ""
            is_branch_monomer = True
        elif char == ")":
            if current_monomer:
                yield Monomer(monomer_position, token_position, current_monomer,
                              is_branch_monomer)
                monomer_position += 1
            current_monomer = ""
            is_branch_monomer = False
        else:
            current_monomer += char

    if current_monomer:
        yield Monomer(monomer_position, token_position, current_monomer,
                      is_branch_monomer)


def _get_connections(connection_section: str) -> list:
    """
    Returns valid connection tokens found in the connections section of a
    HELM string

    Currently not supporting ambiguous connections.

    Example:
        input: "RNA1,CHEM1,2:R1-1:R2|RNA1,CHEM1,20:R-1:R2"
        output: ["RNA1,CHEM1,2:R1-1:R2", "RNA1,CHEM1,20:R-1:R2"]
    """
    if not connection_section:
        return []

    for connection_token in connection_section.split("|"):
        _validate_connection_token(connection_token)
        yield connection_token


def _process_simple_polymer(simple_polymer: str, polymer_id: str) -> Chem.Mol:
    """
    Returns a residue-level Chem.Mol object

    Example:
        input: A.A.A.A.[dF]
        output: Chem.Mol object (A-A-A-A-dF)
    """
    rwmol = Chem.RWMol()
    rwmol.BeginBatchEdit()
    prev_atom_idx = -1

    for monomer in _get_monomers(simple_polymer):
        new_atom_idx = rwmol.AddAtom(Chem.Atom(0))
        new_atom = rwmol.GetAtomWithIdx(new_atom_idx)
        new_atom.SetIntProp(MONOMER_INDEX_PROP_NAME,
                            monomer.position_in_polymer)
        new_atom.SetProp(ATOM_LABEL_PROP_NAME, monomer.name)
        new_atom.SetProp(MOL_LABEL_PROP_NAME, polymer_id)
        new_atom.SetIntProp(TOKEN_POSITION_PROP_NAME, monomer.position_in_token)
        new_atom.SetBoolProp(IS_BRANCH_MONOMER_PROP_NAME, monomer.is_branch)

        if prev_atom_idx == -1:
            prev_atom_idx = new_atom_idx
            continue

        rwmol.AddBond(prev_atom_idx, new_atom_idx, Chem.BondType.SINGLE)
        if not monomer.is_branch:
            prev_atom_idx = new_atom_idx

    rwmol.CommitBatchEdit()
    return Chem.Mol(rwmol)


def _get_simple_polymers(simple_polymers: str) -> (str, str):
    """
    Returns each valid simple polymer as a separate token from the simple
    polymers string.

    :param simple_polymers: the first section of a HELM2 string

    Example:
        input: PEPTIDE1{A.A}|PEPTIDE2{G.G}
        output: (PEPTIDE1, A.A), (PEPTIDE2, G.G)
    """
    seen_simple_polymers = set()
    current_polymer_id = []

    i = 0
    num_chars = len(simple_polymers)
    while i < num_chars:
        char = simple_polymers[i]
        if char == "{":
            current_polymer_id = "".join(current_polymer_id)
            if current_polymer_id in seen_simple_polymers:
                raise ValueError(
                    f"The polymer id, '{current_polymer_id}' is not unique.")

            _validate_polymer_id(current_polymer_id)
            # we need to get the information inside the '{' and '}' tokens
            i += 1
            token_start_index = i
            while (i < num_chars) and (simple_polymers[i] != "}"):
                i += 1
            polymer_token = simple_polymers[token_start_index:i]
            _validate_simple_polymer(polymer_token)

            yield current_polymer_id, polymer_token
            seen_simple_polymers.add(current_polymer_id)
            current_polymer_id = []
            continue
        elif char == "}" or char == "|":
            current_polymer_id = []
        else:
            current_polymer_id.append(char)
        i += 1


def _process_simple_polymers(simple_polymers: str) -> Chem.Mol:
    """
    Processes all polymer tokens in a first section of a HELM2 string
    and returns them as a residue-level Chem.Mol object

    Example:
        input: PEPTIDE1{A.A.A}
        output: Chem.Mol object (A-A-A)
    """
    _validate_simple_polymers_section(simple_polymers)

    for polymer_id, polymer_token in _get_simple_polymers(simple_polymers):
        simple_polymer = _process_simple_polymer(polymer_token, polymer_id)
        yield simple_polymer


def _get_helm2_sections(helm2: str) -> (int, str):
    """
    Returns the simple polymers, connections, polymer group, and annotations
    sections of a HELM2 string as separate tokens.

    :param helm2: a valid HELM2 string

    Example:
        input: PEPTIDE1{A.A.A}$$$$V2.0
        output: (1, PEPTIDE1{A.A.A}), (2,""), (3, ""), (4, "")
    """
    current_helm_section = 1
    helm_section_token = []

    for char in helm2:
        # we're done processing the relevant sections
        if current_helm_section > 4:
            return

        if char == "$":
            # '$' can be present in cxxsmiles in first section so, we need to handle that
            if current_helm_section == 1 and helm_section_token[-1] != "}":
                helm_section_token.append(char)
                continue

            yield current_helm_section, "".join(helm_section_token)
            current_helm_section += 1
            helm_section_token = []
        else:
            helm_section_token.append(char)


def _build_connections(connection_tokens: list, cg_mol: Chem.Mol) -> Chem.Mol:
    """
    Takes connection information and create bonds between atoms that should be
    connected.
    """
    connection_atoms = set()
    connections = {}
    for connection_token in connection_tokens:
        connection_match = CONNECTION_PATTERN.match(connection_token)
        if connection_match is None:
            connection_match = HYDROGEN_BOND_PATTERN.match(connection_token)

        from_polymer = connection_match.group("from_polymer_id")
        to_polymer = connection_match.group("to_polymer_id")

        from_monomer_position = connection_match.group("from_monomer_position")
        to_monomer_position = connection_match.group("to_monomer_position")
        from_attachment_point = connection_match.group("from_attachment_point")
        to_attachment_point = connection_match.group("to_attachment_point")

        from_monomer_key = f"{from_polymer}_{from_monomer_position}"
        to_monomer_key = f"{to_polymer}_{to_monomer_position}"

        connection_atoms.add(from_monomer_key)
        connection_atoms.add(to_monomer_key)

        connections[from_monomer_key] = {
            "to_monomer": to_monomer_key,
            "from_attachment_point": from_attachment_point,
            "to_attachment_point": to_attachment_point
        }

    connection_atoms_lookup = {}
    for atom in cg_mol.GetAtoms():
        monomer_key = f"{atom.GetProp(MOL_LABEL_PROP_NAME)}_{atom.GetProp(MONOMER_INDEX_PROP_NAME)}"
        if monomer_key in connection_atoms:
            connection_atoms_lookup[monomer_key] = atom

    # verify that the all connection atoms can be found
    if not all([
            connection_atoms_lookup.get(connection_atom, None)
            for connection_atom in connection_atoms
    ]):
        raise ValueError(
            "Some atoms specified in the connections section must be couldn't be located."
        )

    rwmol = Chem.RWMol(cg_mol)
    rwmol.BeginBatchEdit()
    for key, connection_info in connections.items():
        from_attachment_point = connection_info["from_attachment_point"]
        to_attachment_point = connection_info["to_attachment_point"]
        bond_type = Chem.BondType.ZERO if from_attachment_point == "pair" else Chem.BondType.SINGLE
        from_atom = connection_atoms_lookup[key]
        to_atom = connection_atoms_lookup[connection_info["to_monomer"]]
        rwmol.AddBond(from_atom.GetIdx(), to_atom.GetIdx(), bond_type)
        new_bond = rwmol.GetBondBetweenAtoms(from_atom.GetIdx(),
                                             to_atom.GetIdx())
        new_bond.SetProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME,
                         f"{from_attachment_point}-{to_attachment_point}")

    rwmol.CommitBatchEdit()
    return Chem.Mol(rwmol)


def _cg_mol_from_helm2(helm2):
    simple_polymers_section, connections_section, *_ = [
        section_token for _, section_token in _get_helm2_sections(helm2)
    ]

    cg_mol = Chem.Mol()
    for mol in _process_simple_polymers(simple_polymers_section):
        cg_mol = Chem.CombineMols(cg_mol, mol)

    connection_tokens = list(_get_connections(connections_section))
    if connection_tokens:
        cg_mol = _build_connections(connection_tokens, cg_mol)

    return cg_mol


def _get_polymers_info(cg_mol):
    """
    Given a coarse-grain ROMol encoding a residue-level biomolecule, returns
    a polymer id and a dictionary with the keys as monomer positions and values
    as the associated ROMol object.
    """
    polymer_info_lookup = defaultdict(dict)
    for atom in cg_mol.GetAtoms():
        polymer_id = atom.GetProp(MOL_LABEL_PROP_NAME)
        monomer_position = int(atom.GetProp(MONOMER_INDEX_PROP_NAME))
        polymer_info_lookup[polymer_id][monomer_position] = atom

    for polymer_id, polymer_info in polymer_info_lookup.items():
        yield polymer_id, polymer_info


def _construct_monomer_helm(monomer):
    monomer_helm = monomer.GetProp(ATOM_LABEL_PROP_NAME)
    if len(monomer_helm) > 1:
        monomer_helm = f"[{monomer_helm}]"

    if monomer.GetBoolProp(IS_BRANCH_MONOMER_PROP_NAME):
        monomer_helm = f"({monomer_helm})"

    return monomer_helm


def _construct_connection_helm(bond):
    from_monomer = bond.GetBeginAtom()
    to_monomer = bond.GetEndAtom()

    from_polymer_id = from_monomer.GetProp(MOL_LABEL_PROP_NAME)
    to_polymer_id = to_monomer.GetProp(MOL_LABEL_PROP_NAME)

    from_monomer_position = from_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME)
    to_monomer_position = to_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME)

    from_attachment_point, to_attachment_point = bond.GetProp(
        CONNECTION_ATTACHMENT_POINTS_PROP_NAME).split("-")

    connection_helm = f"{from_polymer_id},{to_polymer_id},{from_monomer_position}:{from_attachment_point}-{to_monomer_position}:{to_attachment_point}"
    return connection_helm


def _construct_polymer_helm(polymer_id, polymer_info):
    """
    Given a polymer id and a dictionary mapping monomer positions to the
    associated ROMol object, constructs and returns a HELM2 representation
    of the polymer.
    """
    num_monomers = len(polymer_info)
    monomer_token_lookup = defaultdict(list)
    for monomer_position, monomer in polymer_info.items():
        token_position = monomer.GetIntProp(TOKEN_POSITION_PROP_NAME)
        monomer_token_lookup[token_position].append(monomer)

    all_monomers = [None] * len(monomer_token_lookup)
    for token_position, monomers in monomer_token_lookup.items():
        monomers = sorted(monomers,
                          key=lambda x: x.GetIntProp(MONOMER_INDEX_PROP_NAME))
        monomer_helm = (
            _construct_monomer_helm(monomer) for monomer in monomers)
        all_monomers[token_position - 1] = "".join(monomer_helm)

    monomers_helm = ".".join(all_monomers)
    polymer_helm = f"{polymer_id}{{{monomers_helm}}}"
    return polymer_helm


def _cg_mol_to_helm2(cg_mol):
    polymers = (_construct_polymer_helm(polymer_id, polymer_info)
                for polymer_id, polymer_info in _get_polymers_info(cg_mol))
    polymers_helm = "|".join(polymers)

    connections = (_construct_connection_helm(bond)
                   for bond in cg_mol.GetBonds()
                   if bond.HasProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME))
    connections_helm = "|".join(connections)

    return f"{polymers_helm}${connections_helm}$$$V2.0"