"""
A module to parse convert biomolecules to and from HELM strings.
The Hierarchical Editing Language for Macromolecules(HELM) was designed to
create a single notation that can encode the structure of all biomolecules.
HELM encodes different types of biomolecules, including XNA, peptides, and Chem
polymers, but also allows users to defined custom polymer types.
The HELM specification can be found at
https://pistoiaalliance.atlassian.net/wiki/spaces/PUB/pages/13795362/HELM+Notation
"""
from rdkit import Chem
from collections import defaultdict
from dataclasses import dataclass
import re
MONOMER_INDEX_PROP_NAME = "_monomerIndex"
ATOM_LABEL_PROP_NAME = "atomLabel"
MOL_LABEL_PROP_NAME = "_name"
TOKEN_POSITION_PROP_NAME = "tokenPosition"
IS_BRANCH_MONOMER_PROP_NAME = "isBranchMonomer"
CONNECTION_ATTACHMENT_POINTS_PROP_NAME = "attachmentPoints"
SUPPORTED_POLYMERS = {"PEPTIDE", "DNA", "RNA"}
POLYMER_ID_PATTERN = re.compile(
r"""^
([A-Z]+) # the polymer type
[1-9]\d* # the polymer number >= 1
$
""", re.X)
SIMPLE_POLYMER_PATTERN = re.compile(
r"""^
([^\}]+) # simple polymer token cannot contain '}' character
(\"[^$]+\")? # inline annotation eg. "mutant"
$
""", re.X)
# SIMPLE_POLYMER_PATTERN(|SIMPLE_POLYMER_PATTERN)*
SIMPLE_POLYMERS_SECTION_PATTERN = re.compile(
r"""^
([A-Z]+\d+)\{(.+)\}(\".+\")? # simple polymer pattern
(|([A-Z]+\d+)\{(.+)\}(\".+\")?)* #repeated simple polymers separated by '|'
$
""", re.X)
# example RNA1,CHEM1,21:R2‐1:R1
CONNECTION_PATTERN = re.compile(
r"""^
(?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
(?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
(?P<from_monomer_position>[1-9]\d*) # eg. 21
\:(?P<from_attachment_point>R[1-9]\d*)\- # attachment point eg. R2
(?P<to_monomer_position>[1-9]\d*) # eg. 23
\:(?P<to_attachment_point>R[1-9]\d*) # eg. R1
$
""", re.X)
# example RNA1,CHEM1,21:pair‐1:pair
HYDROGEN_BOND_PATTERN = re.compile(
r"""^
(?P<from_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE1,
(?P<to_polymer_id>[A-Z]+[1-9]\d*), # eg. PEPTIDE2,
(?P<from_monomer_position>[1-9]\d*) # eg. 21
\:(?P<from_attachment_point>pair)\-
(?P<to_monomer_position>[1-9]\d*) # eg. 23
\:(?P<to_attachment_point>pair)
$
""", re.X)
[docs]@dataclass
class Monomer:
position_in_polymer: int
position_in_token: int
name: str
is_branch: bool
[docs]def to_rdkit(helm: str) -> Chem.Mol:
"""
Converts a helm string to a residue-level ROMol
"""
helm_version = helm[helm.rfind("$") + 1:]
if not helm_version:
helm2 = convert_helm1_to_helm2(helm)
return _cg_mol_from_helm2(helm2)
elif helm_version.upper() == "V2.0":
return _cg_mol_from_helm2(helm)
raise ValueError("Only HELM and HELMV2.0 are supported.")
[docs]def to_helm(cg_mol: Chem.Mol) -> str:
"""
Converts a coarse-grain residue-level ROMol to a helm string
"""
return _cg_mol_to_helm2(cg_mol)
def _validate_polymer_id(polymer_id: str) -> None:
"""
a valid polymer id should have a polymer type in caps followed by a
number. Eg., PEPTIDE1
:raises ValueError: If the polymer id doesn't match the HELM specification.
:raises ValueError: If the polymer type is unsupported.
"""
polymer_id_match = POLYMER_ID_PATTERN.match(polymer_id)
if not polymer_id_match:
raise ValueError(f"The polymer id '{polymer_id}' is not valid.")
polymer_type = polymer_id_match.group(1)
if polymer_type not in SUPPORTED_POLYMERS:
raise ValueError(
f"The polymer type '{polymer_type}' is not currently supported.")
def _validate_connection_token(connection_token):
"""
Helper function to validate a connection token.
A valid connection token should be of the form
SourcePolymerID,TargetPolymerID,SourceMonomerPosition:SourceAttachmentPoint-TargetMonomerPosition:TargetAttachmentPoint
:raises ValueError: If the connection token does not match the HELM spcification.
"""
connection_match = CONNECTION_PATTERN.match(connection_token)
if connection_match is not None:
return
hydrogen_bond_match = re.match(HYDROGEN_BOND_PATTERN, connection_token)
if hydrogen_bond_match is not None:
return
raise ValueError(f"Invalid connection: {connection_token}.")
def _validate_simple_polymers_section(simple_polymers_section):
"""
Helper function to validate the simple polymers HELM section.
:raises ValueError: If the simple polymers section token doesn't match the
HELM specification.
"""
if re.search(SIMPLE_POLYMERS_SECTION_PATTERN,
simple_polymers_section) is None:
raise ValueError(f"Invalid simple polymers: {simple_polymers_section}.")
def _validate_simple_polymer(simple_polymer):
"""
Helper function to validate a simple polymer token.
:raises ValueError: If the simple polymer token doesn't match the HELM
specification.
"""
if re.search(SIMPLE_POLYMER_PATTERN, simple_polymer) is None:
raise ValueError(f"Invalid simple polymer: {simple_polymer}.")
[docs]def convert_helm1_to_helm2(helm1: str) -> str:
"""
Helper function to convert helm1 to helm2
The helm specifications changed from
simple polymers$connections$H-bonds$annotations$
to
simple polymers$connections(including H-bonds)$polymer groups$annotations$V2.0
"""
converted_helm2_sections = [""] * 4
# since the first section is still legal, we just add it
sections = helm1.split("}$")
if len(sections) == 1:
raise ValueError("Invalid simple polymers")
converted_helm2_sections[0] = sections[0] + "}"
remaining_helm1 = "}$".join(sections[1:])
remaining_sections = remaining_helm1.split("$")
# add information about connections
connections = [remaining_sections[0]]
# if join the connections section to the hydrogen bonds section
if remaining_sections[1]:
connections.append(remaining_sections[1])
converted_helm2_sections[1] = "|".join(connections)
# add the remaining section and the version info
converted_helm2_sections[3] = remaining_sections[2]
return "$".join(converted_helm2_sections) + "$V2.0"
def _get_multiple_character_monomer_token(simple_polymer_iterator):
"""
Since a multi character token is surrounded by square brackets, we need to
keep track of the depth of square brackets in the token, and return the token.
:param simple_polymer_iterator: a string iterator on the current polymer
Example:
input: N[C@H](C)C(=O)O].A.P
output: N[C@H](C)C(=O)O
"""
depth = 1
current_monomer = ""
for char in simple_polymer_iterator:
if char == "[":
depth += 1
elif char == "]":
depth -= 1
# if we find a valid token, e.g. [token], we need to yield it and
# release ownership of the iterator
if depth == 0:
return current_monomer
current_monomer += char
def _get_monomers(simple_polymer: str) -> Monomer:
"""
Returns the monomer position within the polymer, monomer position within the
containing token, the monomer name, and whether the monomer is a branch monomer
Monomer tokens are defined as dot-separated values within the curly braces
of a simple polymer. A valid monomer tokens can be either a single
capitalized letter of the alphabet, or multiple characters surrounded
by square brackets. A multiple character monomer token may be a SMILES
string or a branched monomer token , i.e. multiple monomers in one monomer
token with one monomer surrounded by brackets.
Currently not supporting ambiguous monomers.
Example:
input: A.A(dF)P
output: Monomer(1, 1, A, False), Monomer(2, 1, A, False), Monomer(3, 2, dF, True), Monomer(4, 3, P, False)
"""
current_monomer = ""
simple_polymer = iter(simple_polymer)
# monomer/ token positions are synonymous to residue numbers and are 1-based
monomer_position = 1
token_position = 1
is_branch_monomer = False
for char in simple_polymer:
if char == ".":
if current_monomer:
yield Monomer(monomer_position, token_position, current_monomer,
is_branch_monomer)
monomer_position += 1
current_monomer = ""
token_position += 1
elif char == "[":
if current_monomer:
yield Monomer(monomer_position, token_position, current_monomer,
is_branch_monomer)
monomer_position += 1
current_monomer = ""
multiple_character_monomer_token = _get_multiple_character_monomer_token(
simple_polymer)
if multiple_character_monomer_token is not None:
yield Monomer(monomer_position, token_position,
multiple_character_monomer_token,
is_branch_monomer)
monomer_position += 1
elif char == "(":
if current_monomer:
yield Monomer(monomer_position, token_position, current_monomer,
is_branch_monomer)
monomer_position += 1
current_monomer = ""
is_branch_monomer = True
elif char == ")":
if current_monomer:
yield Monomer(monomer_position, token_position, current_monomer,
is_branch_monomer)
monomer_position += 1
current_monomer = ""
is_branch_monomer = False
else:
current_monomer += char
if current_monomer:
yield Monomer(monomer_position, token_position, current_monomer,
is_branch_monomer)
def _get_connections(connection_section: str) -> list:
"""
Returns valid connection tokens found in the connections section of a
HELM string
Currently not supporting ambiguous connections.
Example:
input: "RNA1,CHEM1,2:R1-1:R2|RNA1,CHEM1,20:R-1:R2"
output: ["RNA1,CHEM1,2:R1-1:R2", "RNA1,CHEM1,20:R-1:R2"]
"""
if not connection_section:
return []
for connection_token in connection_section.split("|"):
_validate_connection_token(connection_token)
yield connection_token
def _process_simple_polymer(simple_polymer: str, polymer_id: str) -> Chem.Mol:
"""
Returns a residue-level Chem.Mol object
Example:
input: A.A.A.A.[dF]
output: Chem.Mol object (A-A-A-A-dF)
"""
rwmol = Chem.RWMol()
rwmol.BeginBatchEdit()
prev_atom_idx = -1
for monomer in _get_monomers(simple_polymer):
new_atom_idx = rwmol.AddAtom(Chem.Atom(0))
new_atom = rwmol.GetAtomWithIdx(new_atom_idx)
new_atom.SetIntProp(MONOMER_INDEX_PROP_NAME,
monomer.position_in_polymer)
new_atom.SetProp(ATOM_LABEL_PROP_NAME, monomer.name)
new_atom.SetProp(MOL_LABEL_PROP_NAME, polymer_id)
new_atom.SetIntProp(TOKEN_POSITION_PROP_NAME, monomer.position_in_token)
new_atom.SetBoolProp(IS_BRANCH_MONOMER_PROP_NAME, monomer.is_branch)
if prev_atom_idx == -1:
prev_atom_idx = new_atom_idx
continue
rwmol.AddBond(prev_atom_idx, new_atom_idx, Chem.BondType.SINGLE)
if not monomer.is_branch:
prev_atom_idx = new_atom_idx
rwmol.CommitBatchEdit()
return Chem.Mol(rwmol)
def _get_simple_polymers(simple_polymers: str) -> (str, str):
"""
Returns each valid simple polymer as a separate token from the simple
polymers string.
:param simple_polymers: the first section of a HELM2 string
Example:
input: PEPTIDE1{A.A}|PEPTIDE2{G.G}
output: (PEPTIDE1, A.A), (PEPTIDE2, G.G)
"""
seen_simple_polymers = set()
current_polymer_id = []
i = 0
num_chars = len(simple_polymers)
while i < num_chars:
char = simple_polymers[i]
if char == "{":
current_polymer_id = "".join(current_polymer_id)
if current_polymer_id in seen_simple_polymers:
raise ValueError(
f"The polymer id, '{current_polymer_id}' is not unique.")
_validate_polymer_id(current_polymer_id)
# we need to get the information inside the '{' and '}' tokens
i += 1
token_start_index = i
while (i < num_chars) and (simple_polymers[i] != "}"):
i += 1
polymer_token = simple_polymers[token_start_index:i]
_validate_simple_polymer(polymer_token)
yield current_polymer_id, polymer_token
seen_simple_polymers.add(current_polymer_id)
current_polymer_id = []
continue
elif char == "}" or char == "|":
current_polymer_id = []
else:
current_polymer_id.append(char)
i += 1
def _process_simple_polymers(simple_polymers: str) -> Chem.Mol:
"""
Processes all polymer tokens in a first section of a HELM2 string
and returns them as a residue-level Chem.Mol object
Example:
input: PEPTIDE1{A.A.A}
output: Chem.Mol object (A-A-A)
"""
_validate_simple_polymers_section(simple_polymers)
for polymer_id, polymer_token in _get_simple_polymers(simple_polymers):
simple_polymer = _process_simple_polymer(polymer_token, polymer_id)
yield simple_polymer
def _get_helm2_sections(helm2: str) -> (int, str):
"""
Returns the simple polymers, connections, polymer group, and annotations
sections of a HELM2 string as separate tokens.
:param helm2: a valid HELM2 string
Example:
input: PEPTIDE1{A.A.A}$$$$V2.0
output: (1, PEPTIDE1{A.A.A}), (2,""), (3, ""), (4, "")
"""
current_helm_section = 1
helm_section_token = []
for char in helm2:
# we're done processing the relevant sections
if current_helm_section > 4:
return
if char == "$":
# '$' can be present in cxxsmiles in first section so, we need to handle that
if current_helm_section == 1 and helm_section_token[-1] != "}":
helm_section_token.append(char)
continue
yield current_helm_section, "".join(helm_section_token)
current_helm_section += 1
helm_section_token = []
else:
helm_section_token.append(char)
def _build_connections(connection_tokens: list, cg_mol: Chem.Mol) -> Chem.Mol:
"""
Takes connection information and create bonds between atoms that should be
connected.
"""
connection_atoms = set()
connections = {}
for connection_token in connection_tokens:
connection_match = CONNECTION_PATTERN.match(connection_token)
if connection_match is None:
connection_match = HYDROGEN_BOND_PATTERN.match(connection_token)
from_polymer = connection_match.group("from_polymer_id")
to_polymer = connection_match.group("to_polymer_id")
from_monomer_position = connection_match.group("from_monomer_position")
to_monomer_position = connection_match.group("to_monomer_position")
from_attachment_point = connection_match.group("from_attachment_point")
to_attachment_point = connection_match.group("to_attachment_point")
from_monomer_key = f"{from_polymer}_{from_monomer_position}"
to_monomer_key = f"{to_polymer}_{to_monomer_position}"
connection_atoms.add(from_monomer_key)
connection_atoms.add(to_monomer_key)
connections[from_monomer_key] = {
"to_monomer": to_monomer_key,
"from_attachment_point": from_attachment_point,
"to_attachment_point": to_attachment_point
}
connection_atoms_lookup = {}
for atom in cg_mol.GetAtoms():
monomer_key = f"{atom.GetProp(MOL_LABEL_PROP_NAME)}_{atom.GetProp(MONOMER_INDEX_PROP_NAME)}"
if monomer_key in connection_atoms:
connection_atoms_lookup[monomer_key] = atom
# verify that the all connection atoms can be found
if not all([
connection_atoms_lookup.get(connection_atom, None)
for connection_atom in connection_atoms
]):
raise ValueError(
"Some atoms specified in the connections section must be couldn't be located."
)
rwmol = Chem.RWMol(cg_mol)
rwmol.BeginBatchEdit()
for key, connection_info in connections.items():
from_attachment_point = connection_info["from_attachment_point"]
to_attachment_point = connection_info["to_attachment_point"]
bond_type = Chem.BondType.ZERO if from_attachment_point == "pair" else Chem.BondType.SINGLE
from_atom = connection_atoms_lookup[key]
to_atom = connection_atoms_lookup[connection_info["to_monomer"]]
rwmol.AddBond(from_atom.GetIdx(), to_atom.GetIdx(), bond_type)
new_bond = rwmol.GetBondBetweenAtoms(from_atom.GetIdx(),
to_atom.GetIdx())
new_bond.SetProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME,
f"{from_attachment_point}-{to_attachment_point}")
rwmol.CommitBatchEdit()
return Chem.Mol(rwmol)
def _cg_mol_from_helm2(helm2):
simple_polymers_section, connections_section, *_ = [
section_token for _, section_token in _get_helm2_sections(helm2)
]
cg_mol = Chem.Mol()
for mol in _process_simple_polymers(simple_polymers_section):
cg_mol = Chem.CombineMols(cg_mol, mol)
connection_tokens = list(_get_connections(connections_section))
if connection_tokens:
cg_mol = _build_connections(connection_tokens, cg_mol)
return cg_mol
def _get_polymers_info(cg_mol):
"""
Given a coarse-grain ROMol encoding a residue-level biomolecule, returns
a polymer id and a dictionary with the keys as monomer positions and values
as the associated ROMol object.
"""
polymer_info_lookup = defaultdict(dict)
for atom in cg_mol.GetAtoms():
polymer_id = atom.GetProp(MOL_LABEL_PROP_NAME)
monomer_position = int(atom.GetProp(MONOMER_INDEX_PROP_NAME))
polymer_info_lookup[polymer_id][monomer_position] = atom
for polymer_id, polymer_info in polymer_info_lookup.items():
yield polymer_id, polymer_info
def _construct_monomer_helm(monomer):
monomer_helm = monomer.GetProp(ATOM_LABEL_PROP_NAME)
if len(monomer_helm) > 1:
monomer_helm = f"[{monomer_helm}]"
if monomer.GetBoolProp(IS_BRANCH_MONOMER_PROP_NAME):
monomer_helm = f"({monomer_helm})"
return monomer_helm
def _construct_connection_helm(bond):
from_monomer = bond.GetBeginAtom()
to_monomer = bond.GetEndAtom()
from_polymer_id = from_monomer.GetProp(MOL_LABEL_PROP_NAME)
to_polymer_id = to_monomer.GetProp(MOL_LABEL_PROP_NAME)
from_monomer_position = from_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME)
to_monomer_position = to_monomer.GetIntProp(MONOMER_INDEX_PROP_NAME)
from_attachment_point, to_attachment_point = bond.GetProp(
CONNECTION_ATTACHMENT_POINTS_PROP_NAME).split("-")
connection_helm = f"{from_polymer_id},{to_polymer_id},{from_monomer_position}:{from_attachment_point}-{to_monomer_position}:{to_attachment_point}"
return connection_helm
def _construct_polymer_helm(polymer_id, polymer_info):
"""
Given a polymer id and a dictionary mapping monomer positions to the
associated ROMol object, constructs and returns a HELM2 representation
of the polymer.
"""
num_monomers = len(polymer_info)
monomer_token_lookup = defaultdict(list)
for monomer_position, monomer in polymer_info.items():
token_position = monomer.GetIntProp(TOKEN_POSITION_PROP_NAME)
monomer_token_lookup[token_position].append(monomer)
all_monomers = [None] * len(monomer_token_lookup)
for token_position, monomers in monomer_token_lookup.items():
monomers = sorted(monomers,
key=lambda x: x.GetIntProp(MONOMER_INDEX_PROP_NAME))
monomer_helm = (
_construct_monomer_helm(monomer) for monomer in monomers)
all_monomers[token_position - 1] = "".join(monomer_helm)
monomers_helm = ".".join(all_monomers)
polymer_helm = f"{polymer_id}{{{monomers_helm}}}"
return polymer_helm
def _cg_mol_to_helm2(cg_mol):
polymers = (_construct_polymer_helm(polymer_id, polymer_info)
for polymer_id, polymer_info in _get_polymers_info(cg_mol))
polymers_helm = "|".join(polymers)
connections = (_construct_connection_helm(bond)
for bond in cg_mol.GetBonds()
if bond.HasProp(CONNECTION_ATTACHMENT_POINTS_PROP_NAME))
connections_helm = "|".join(connections)
return f"{polymers_helm}${connections_helm}$$$V2.0"