Source code for schrodinger.protein.helm

"""
A module to parse convert biomolecules to and from HELM strings.

The Hierarchical Editing Language for Macromolecules(HELM) was designed to
create a single notation that can encode the structure of all biomolecules.

HELM encodes different types of biomolecules, including XNA, peptides, and Chem
polymers, but also allows users to defined custom polymer types.

The HELM specification can be found at
https://pistoiaalliance.atlassian.net/wiki/spaces/PUB/pages/13795362/HELM+Notation
"""
from rdkit import Chem

from collections import defaultdict
from dataclasses import dataclass
import re

MONOMER_INDEX_PROP_NAME = "_monomerIndex"
ATOM_LABEL_PROP_NAME = "atomLabel"
MOL_LABEL_PROP_NAME = "_name"
TOKEN_POSITION_PROP_NAME = "tokenPosition"
IS_BRANCH_MONOMER_PROP_NAME = "isBranchMonomer"
CONNECTION_ATTACHMENT_POINTS_PROP_NAME = "attachmentPoints"

SUPPORTED_POLYMERS = {"PEPTIDE", "DNA", "RNA"}

POLYMER_ID_PATTERN = re.compile(
    r"""^
        ([A-Z]+) # the polymer type
        [1-9]\d* # the polymer number >= 1
        $
        """, re.X)
SIMPLE_POLYMER_PATTERN = re.compile(
    r"""^
        ([^\}]+) # simple polymer token cannot contain '}' character
        (\"[^$]+\")? # inline annotation eg. "mutant"
        $
        """, re.X)
# SIMPLE_POLYMER_PATTERN(|SIMPLE_POLYMER_PATTERN)*
SIMPLE_POLYMERS_SECTION_PATTERN = re.compile(
    r"""^
         ([A-Z]+\d+)\{(.+)\}(\".+\")? # simple polymer pattern
         (|([A-Z]+\d+)\{(.+)\}(\".+\")?)* #repeated simple polymers separated by '|'
         $
         """, re.X)

# example RNA1,CHEM1,21:R2‐1:R1
CONNECTION_PATTERN = re.compile(
    r"""^
        (?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
        (?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
        (?P<from_monomer_position>[1-9]\d*) # eg. 21
        \:(?P<from_attachment_point>R[1-9]\d*)\- # attachment point eg. R2
        (?P<to_monomer_position>[1-9]\d*) # eg. 23
        \:(?P<to_attachment_point>R[1-9]\d*) # eg. R1
        $
        """, re.X)
# example RNA1,CHEM1,21:pair‐1:pair
HYDROGEN_BOND_PATTERN = re.compile(
    r"""^
       (?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
       (?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
       (?P<from_monomer_position>[1-9]\d*) # eg. 21
       \:(?P<from_attachment_point>pair)\-
       (?P<to_monomer_position>[1-9]\d*) # eg. 23
       \:(?P<to_attachment_point>pair)
       $
       """, re.X)


[docs]@dataclass class Monomer: position_in_polymer: int position_in_token: int name: str is_branch: bool
[docs]def to_rdkit(helm: str) -> Chem.Mol: """ Converts a helm string to a residue-level ROMol """ helm_version = helm[helm.rfind("$") + 1:] if not helm_version: helm2 = convert_helm1_to_helm2(helm) return _cg_mol_from_helm2(helm2) elif helm_version.upper() == "V2.0": return _cg_mol_from_helm2(helm) raise ValueError("Only HELM and HELMV2.0 are supported.")
[docs]def to_helm(cg_mol: Chem.Mol) -> str: """ Converts a coarse-grain residue-level ROMol to a helm string """ return _cg_mol_to_helm2(cg_mol)
def _validate_polymer_id(polymer_id: str) -> None: """ a valid polymer id should have a polymer type in caps followed by a number. Eg., PEPTIDE1 :raises ValueError: If the polymer id doesn't match the HELM specification. :raises ValueError: If the polymer type is unsupported. """ polymer_id_match = POLYMER_ID_PATTERN.match(polymer_id) if not polymer_id_match: raise ValueError(f"The polymer id '{polymer_id}' is not valid.") polymer_type = polymer_id_match.group(1) if polymer_type not in SUPPORTED_POLYMERS: raise ValueError( f"The polymer type '{polymer_type}' is not currently supported.") def _validate_connection_token(connection_token): """ Helper function to validate a connection token. A valid connection token should be of the form SourcePolymerID,TargetPolymerID,SourceMonomerPosition:SourceAttachmentPoint-TargetMonomerPosition:TargetAttachmentPoint :raises ValueError: If the connection token does not match the HELM spcification. """ connection_match = CONNECTION_PATTERN.match(connection_token) if connection_match is not None: return hydrogen_bond_match = re.match(HYDROGEN_BOND_PATTERN, connection_token) if hydrogen_bond_match is not None: return raise ValueError(f"Invalid connection: {connection_token}.") def _validate_simple_polymers_section(simple_polymers_section): """ Helper function to validate the simple polymers HELM section. :raises ValueError: If the simple polymers section token doesn't match the HELM specification. """ if re.search(SIMPLE_POLYMERS_SECTION_PATTERN, simple_polymers_section) is None: raise ValueError(f"Invalid simple polymers: {simple_polymers_section}.") def _validate_simple_polymer(simple_polymer): """ Helper function to validate a simple polymer token. :raises ValueError: If the simple polymer token doesn't match the HELM specification. """ if re.search(SIMPLE_POLYMER_PATTERN, simple_polymer) is None: raise ValueError(f"Invalid simple polymer: {simple_polymer}.")
[docs]def convert_helm1_to_helm2(helm1: str) -> str: """ Helper function to convert helm1 to helm2 The helm specifications changed from simple polymers$connections$H-bonds$annotations$ to simple polymers$connections(including H-bonds)$polymer groups$annotations$V2.0 """ converted_helm2_sections = [""] * 4 # since the first section is still legal, we just add it sections = helm1.split("}$") if len(sections) == 1: raise ValueError("Invalid simple polymers") converted_helm2_sections[0] = sections[0] + "}" remaining_helm1 = "}$".join(sections[1:]) remaining_sections = remaining_helm1.split("$") # add information about connections connections = [remaining_sections[0]] # if join the connections section to the hydrogen bonds section if remaining_sections[1]: connections.append(remaining_sections[1]) converted_helm2_sections[1] = "|".join(connections) # add the remaining section and the version info converted_helm2_sections[3] = remaining_sections[2] return "$".join(converted_helm2_sections) + "$V2.0"
def _get_multiple_character_monomer_token(simple_polymer_iterator): """ Since a multi character token is surrounded by square brackets, we need to keep track of the depth of square brackets in the token, and return the token. :param simple_polymer_iterator: a string iterator on the current polymer Example: input: N[C@H](C)C(=O)O].A.P output: N[C@H](C)C(=O)O """ depth = 1 current_monomer = "" for char in simple_polymer_iterator: if char == "[": depth += 1 elif char == "]": depth -= 1 # if we find a valid token, e.g. [token], we need to yield it and # release ownership of the iterator if depth == 0: return current_monomer current_monomer += char def _get_monomers(simple_polymer: str) -> Monomer: """ Returns the monomer position within the polymer, monomer position within the containing token, the monomer name, and whether the monomer is a branch monomer Monomer tokens are defined as dot-separated values within the curly braces of a simple polymer. A valid monomer tokens can be either a single capitalized letter of the alphabet, or multiple characters surrounded by square brackets. A multiple character monomer token may be a SMILES string or a branched monomer token , i.e. multiple monomers in one monomer token with one monomer surrounded by brackets. Currently not supporting ambiguous monomers. Example: input: A.A(dF)P output: Monomer(1, 1, A, False), Monomer(2, 1, A, False), Monomer(3, 2, dF, True), Monomer(4, 3, P, False) """ current_monomer = "" simple_polymer = iter(simple_polymer) # monomer/ token positions are synonymous to residue numbers and are 1-based monomer_position = 1 token_position = 1 is_branch_monomer = False for char in simple_polymer: if char == ".": if current_monomer: yield Monomer(monomer_position, token_position, current_monomer, is_branch_monomer) monomer_position += 1 current_monomer = "" token_position += 1 elif char == "[": if current_monomer: yield Monomer(monomer_position, token_position, current_monomer, is_branch_monomer) monomer_position += 1 current_monomer = "" multiple_character_monomer_token = _get_multiple_character_monomer_token( simple_polymer) if multiple_character_monomer_token is not None: yield Monomer(monomer_position, token_position, multiple_character_monomer_token, is_branch_monomer) monomer_position += 1 elif char == "(": if current_monomer: yield Monomer(monomer_position, token_position, current_monomer, is_branch_monomer) monomer_position += 1 current_monomer = "" is_branch_monomer = True elif char == ")": if current_monomer: yield Monomer(monomer_position, token_position, current_monomer, is_branch_monomer) monomer_position += 1 current_monomer = "" is_branch_monomer = False else: current_monomer += char if current_monomer: yield Monomer(monomer_position, token_position, current_monomer, is_branch_monomer) def _get_connections(connection_section: str) -> list: """ Returns valid connection tokens found in the connections section of a HELM string Currently not supporting ambiguous connections. Example: input: "RNA1,CHEM1,2:R1-1:R2|RNA1,CHEM1,20:R-1:R2" output: ["RNA1,CHEM1,2:R1-1:R2", "RNA1,CHEM1,20:R-1:R2"] """ if not connection_section: return [] for connection_token in connection_section.split("|"): _validate_connection_token(connection_token) yield connection_token def _process_simple_polymer(simple_polymer: str, polymer_id: str) -> Chem.Mol: """ Returns a residue-level Chem.Mol object Example: input: A.A.A.A.[dF] output: Chem.Mol object (A-A-A-A-dF) """ rwmol = Chem.RWMol() rwmol.BeginBatchEdit() prev_atom_idx = -1 for monomer in _get_monomers(simple_polymer): new_atom_idx = rwmol.AddAtom(Chem.Atom(0)) new_atom = rwmol.GetAtomWithIdx(new_atom_idx) new_atom.SetIntProp(MONOMER_INDEX_PROP_NAME, monomer.position_in_polymer) new_atom.SetProp(ATOM_LABEL_PROP_NAME, monomer.name) new_atom.SetProp(MOL_LABEL_PROP_NAME, polymer_id) new_atom.SetIntProp(TOKEN_POSITION_PROP_NAME, monomer.position_in_token) new_atom.SetBoolProp(IS_BRANCH_MONOMER_PROP_NAME, monomer.is_branch) if prev_atom_idx == -1: prev_atom_idx = new_atom_idx continue rwmol.AddBond(prev_atom_idx, new_atom_idx, Chem.BondType.SINGLE) if not monomer.is_branch: prev_atom_idx = new_atom_idx rwmol.CommitBatchEdit() return Chem.Mol(rwmol) def _get_simple_polymers(simple_polymers: str) -> (str, str): """ Returns each valid simple polymer as a separate token from the simple polymers string. :param simple_polymers: the first section of a HELM2 string Example: input: PEPTIDE1{A.A}|PEPTIDE2{G.G} output: (PEPTIDE1, A.A), (PEPTIDE2, G.G) """ seen_simple_polymers = set() current_polymer_id = [] i = 0 num_chars = len(simple_polymers) while i < num_chars: char = simple_polymers[i] if char == "{": current_polymer_id = "".join(current_polymer_id) if current_polymer_id in seen_simple_polymers: raise ValueError( f"The polymer id, '{current_polymer_id}' is not unique.") _validate_polymer_id(current_polymer_id) # we need to get the information inside the '{' and '}' tokens i += 1 token_start_index = i while (i < num_chars) and (simple_polymers[i] != "}"): i += 1 polymer_token = simple_polymers[token_start_index:i] _validate_simple_polymer(polymer_token) yield current_polymer_id, polymer_token seen_simple_polymers.add(current_polymer_id) current_polymer_id = [] continue elif char == "}" or char == "|": current_polymer_id = [] else: current_polymer_id.append(char) i += 1 def _process_simple_polymers(simple_polymers: str) -> Chem.Mol: """ Processes all polymer tokens in a first section of a HELM2 string and returns them as a residue-level Chem.Mol object Example: input: PEPTIDE1{A.A.A} output: Chem.Mol object (A-A-A) """ _validate_simple_polymers_section(simple_polymers) for polymer_id, polymer_token in _get_simple_polymers(simple_polymers): simple_polymer = _process_simple_polymer(polymer_token, polymer_id) yield simple_polymer def _get_helm2_sections(helm2: str) -> (int, str): """ Returns the simple polymers, connections, polymer group, and annotations sections of a HELM2 string as separate tokens. :param helm2: a valid HELM2 string Example: input: PEPTIDE1{A.A.A}$$$$V2.0 output: (1, PEPTIDE1{A.A.A}), (2,""), (3, ""), (4, "") """ current_helm_section = 1 helm_section_token = [] for char in helm2: # we're done processing the relevant sections if current_helm_section > 4: return if char == "$": # '$' can be present in cxxsmiles in first section so, we need to handle that if current_helm_section == 1 and helm_section_token[-1] != "}": helm_section_token.append(char) continue yield current_helm_section, "".join(helm_section_token) current_helm_section += 1 helm_section_token = [] else: helm_section_token.append(char) def _build_connections(connection_tokens: list, cg_mol: Chem.Mol) -> Chem.Mol: """ Takes connection information and create bonds between atoms that should be connected. """ connection_atoms = set() connections = {} for connection_token in connection_tokens: connection_match = CONNECTION_PATTERN.match(connection_token) if connection_match is None: connection_match = HYDROGEN_BOND_PATTERN.match(connection_token) from_polymer = connection_match.group("from_polymer_id") to_polymer = connection_match.group("to_polymer_id") from_monomer_position = connection_match.group("from_monomer_position") to_monomer_position = connection_match.group("to_monomer_position") from_attachment_point = connection_match.group("from_attachment_point") to_attachment_point = connection_match.group("to_attachment_point") from_monomer_key = f"{from_polymer}_{from_monomer_position}" to_monomer_key = f"{to_polymer}_{to_monomer_position}" connection_atoms.add(from_monomer_key) connection_atoms.add(to_monomer_key) connections[from_monomer_key] = { "to_monomer": to_monomer_key, "from_attachment_point": from_attachment_point, "to_attachment_point": to_attachment_point } connection_atoms_lookup = {} for atom in cg_mol.GetAtoms(): monomer_key = f"{atom.GetProp(MOL_LABEL_PROP_NAME)}_{atom.GetProp(MONOMER_INDEX_PROP_NAME)}" if monomer_key in connection_atoms: connection_atoms_lookup[monomer_key] = atom # verify that the all connection atoms can be found if not all([ connection_atoms_lookup.get(connection_atom, None) for connection_atom in connection_atoms ]): raise ValueError( "Some atoms specified in the connections section must be couldn't be located." ) rwmol = Chem.RWMol(cg_mol) rwmol.BeginBatchEdit() for key, connection_info in connections.items(): from_attachment_point = connection_info["from_attachment_point"] to_attachment_point = connection_info["to_attachment_point"] bond_type = Chem.BondType.ZERO if from_attachment_point == "pair" else Chem.BondType.SINGLE from_atom = connection_atoms_lookup[key] to_atom = connection_atoms_lookup[connection_info["to_monomer"]] rwmol.AddBond(from_atom.GetIdx(), to_atom.GetIdx(), bond_type) new_bond = rwmol.GetBondBetweenAtoms(from_atom.GetIdx(), to_atom.GetIdx()) new_bond.SetProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME, f"{from_attachment_point}-{to_attachment_point}") rwmol.CommitBatchEdit() return Chem.Mol(rwmol) def _cg_mol_from_helm2(helm2): simple_polymers_section, connections_section, *_ = [ section_token for _, section_token in _get_helm2_sections(helm2) ] cg_mol = Chem.Mol() for mol in _process_simple_polymers(simple_polymers_section): cg_mol = Chem.CombineMols(cg_mol, mol) connection_tokens = list(_get_connections(connections_section)) if connection_tokens: cg_mol = _build_connections(connection_tokens, cg_mol) return cg_mol def _get_polymers_info(cg_mol): """ Given a coarse-grain ROMol encoding a residue-level biomolecule, returns a polymer id and a dictionary with the keys as monomer positions and values as the associated ROMol object. """ polymer_info_lookup = defaultdict(dict) for atom in cg_mol.GetAtoms(): polymer_id = atom.GetProp(MOL_LABEL_PROP_NAME) monomer_position = int(atom.GetProp(MONOMER_INDEX_PROP_NAME)) polymer_info_lookup[polymer_id][monomer_position] = atom for polymer_id, polymer_info in polymer_info_lookup.items(): yield polymer_id, polymer_info def _construct_monomer_helm(monomer): monomer_helm = monomer.GetProp(ATOM_LABEL_PROP_NAME) if len(monomer_helm) > 1: monomer_helm = f"[{monomer_helm}]" if monomer.GetBoolProp(IS_BRANCH_MONOMER_PROP_NAME): monomer_helm = f"({monomer_helm})" return monomer_helm def _construct_connection_helm(bond): from_monomer = bond.GetBeginAtom() to_monomer = bond.GetEndAtom() from_polymer_id = from_monomer.GetProp(MOL_LABEL_PROP_NAME) to_polymer_id = to_monomer.GetProp(MOL_LABEL_PROP_NAME) from_monomer_position = from_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME) to_monomer_position = to_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME) from_attachment_point, to_attachment_point = bond.GetProp( CONNECTION_ATTACHMENT_POINTS_PROP_NAME).split("-") connection_helm = f"{from_polymer_id},{to_polymer_id},{from_monomer_position}:{from_attachment_point}-{to_monomer_position}:{to_attachment_point}" return connection_helm def _construct_polymer_helm(polymer_id, polymer_info): """ Given a polymer id and a dictionary mapping monomer positions to the associated ROMol object, constructs and returns a HELM2 representation of the polymer. """ num_monomers = len(polymer_info) monomer_token_lookup = defaultdict(list) for monomer_position, monomer in polymer_info.items(): token_position = monomer.GetIntProp(TOKEN_POSITION_PROP_NAME) monomer_token_lookup[token_position].append(monomer) all_monomers = [None] * len(monomer_token_lookup) for token_position, monomers in monomer_token_lookup.items(): monomers = sorted(monomers, key=lambda x: x.GetIntProp(MONOMER_INDEX_PROP_NAME)) monomer_helm = ( _construct_monomer_helm(monomer) for monomer in monomers) all_monomers[token_position - 1] = "".join(monomer_helm) monomers_helm = ".".join(all_monomers) polymer_helm = f"{polymer_id}{{{monomers_helm}}}" return polymer_helm def _cg_mol_to_helm2(cg_mol): polymers = (_construct_polymer_helm(polymer_id, polymer_info) for polymer_id, polymer_info in _get_polymers_info(cg_mol)) polymers_helm = "|".join(polymers) connections = (_construct_connection_helm(bond) for bond in cg_mol.GetBonds() if bond.HasProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME)) connections_helm = "|".join(connections) return f"{polymers_helm}${connections_helm}$$$V2.0"