# Source code for schrodinger.application.scaffold_enumeration.cxsmiles

'''
Functions to parse "repeating units" and "position variant bonds"
from CX SMILES "features" text are not particularly bright, but probably
good enough for machine-generated CX SMILES.
'''

import collections
import html
import json
import re

from rdkit import Chem

from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log

from . import common
from . import markush

# Lightweight records produced by the CX SMILES "features" parsers below.
MCG = collections.namedtuple('MCG', 'atoms center')
SRU = collections.namedtuple('SRU', 'atoms subscript superscript')

# Name of the molecule property from which the raw CX SMILES "features"
# text is read.
CXSMILES_DATA_PROP = '_CXSMILES_Data'

# Per-field documentation (shows up in help() on the tuple classes).
MCG.atoms.__doc__ = "List of atom indices ([int])."
MCG.center.__doc__ = "Central atom index (int)."

SRU.atoms.__doc__ = "List of atom indices ([int])."
SRU.subscript.__doc__ = "SRU's subscript (str)."
SRU.superscript.__doc__ = "SRU's superscript (str)."

# Module-level logger; the functions below use it to emit warnings about
# unparsable input and invalid atom indices.
logger = log.get_output_logger(__name__)

#------------------------------------------------------------------------------#


def parse_mcg(text, pos, accum):
    '''
    Parses "multi-center SGroup" data from CX SMILES "features".

    According to the CX SMILES specification, the multi-center atom index
    is written after "m:", followed by a colon and the "."-separated
    indices of the atoms that form the group. Several groups are separated
    by commas, for example::

        m:0:7.6.5.4.3,2:12.11.10.9.8

    Scanning stops at the first character that is neither part of a group
    (digits, "." and ":") nor a comma. Groups that hold fewer than two
    indices are silently skipped. A group that is still open when `text`
    runs out (no terminating character) is not appended.

    :param text: CX SMILES "features" string.
    :type text: str

    :param pos: Index of the character in `text` right after "m:".
    :type pos: int

    :param accum: List to which the parsed groups (`MCG`) are appended.
    :type accum: list

    :return: Index of the first unconsumed character in `text`.
    :rtype: int
    '''

    group_start = pos
    end = len(text)

    while pos < end:
        ch = text[pos]
        if ch in '.:0123456789':
            # still inside the current group
            pos += 1
            continue

        # any other character closes the group collected so far
        tokens = re.split(r'[.:]', text[group_start:pos])
        indices = [int(tok) for tok in tokens if tok]
        if len(indices) > 1:
            accum.append(MCG(center=indices[0], atoms=indices[1:]))

        if ch != ',':
            # not a group separator: leave it for the caller
            break

        pos += 1
        group_start = pos

    return pos
#------------------------------------------------------------------------------#
def parse_sru(text, pos, accum):
    '''
    Parses "SRU" data from CX SMILES "features".

    Summary of the relevant part of the CX SMILES specification: each
    polymer Sgroup is exported after "Sg:" as colon-separated fields:

    1. Sgroup type keyword ("n" stands for SRU).
    2. Atom indexes separated with commas.
    3. Subscript of the Sgroup; may be empty when it equals the keyword.
       Escaped field.
    4. Superscript of the Sgroup (connectivity and flip information
       only); may be empty. Escaped field.
    5. Head crossing bond indexes; may be empty.
    6. Tail crossing bond indexes; may be empty.
    7. Bracket orientation/type/coordinates (only with the "c" export
       option).

    A colon is needed after the last non-empty field, so trailing fields
    may be left out altogether. Examples::

        CCCC |Sg:gen:0,1,2:|
        CCCC |Sg:n:0,1,2:3-6:eu|
        *CC(*)C(*)N* |Sg:n:6,1,2,4::hh&#44;f:6,0,:4,2,|

    In escaped fields special characters are written as "&#code", where
    code is the ASCII code of the character; such references are decoded
    here with `html.unescape`.

    This subroutine recognizes only: atoms (2), subscript (3), and
    superscript (4). The features terminator "|" always ends the Sgroup;
    fields that were not reached by then are taken to be empty. Nothing
    is appended if the atoms field holds no indices, or if `text` runs
    out before three fields (or a "|") are seen.

    :param text: CX SMILES "features" string.
    :type text: str

    :param pos: Index of the character in `text` right after "Sg:n:".
    :type pos: int

    :param accum: List to which the parsed Sgroups (`SRU`) are appended.
    :type accum: list

    :return: Index of the first unconsumed character in `text`.
    :rtype: int

    :raises ValueError: If the atoms field holds a non-integer token.
    '''

    num_fields = 3  # atoms, subscript, superscript
    start = pos
    fields = []

    while pos < len(text):
        ch = text[pos]

        if ch == '|':
            # End of the features text: close the field being read and
            # treat the omitted trailing fields as empty. Running past
            # "|" would make the caller see a missing terminator.
            fields.append(text[start:pos])
            fields.extend([''] * (num_fields - len(fields)))
            break

        # despite of the specs above, third field of a legit SRU may
        # terminate at comma
        if ch == ':' or (len(fields) == 2 and ch == ','):
            fields.append(text[start:pos])
            start = pos + 1
            if len(fields) == num_fields:
                break

        pos += 1

    if len(fields) == num_fields:
        atoms = [int(t) for t in fields[0].split(',') if t]
        if atoms:
            accum.append(
                SRU(atoms=atoms,
                    subscript=html.unescape(fields[1]),
                    superscript=html.unescape(fields[2])))

    return pos
#------------------------------------------------------------------------------#
def parse_cx_extensions(text):
    '''
    Extracts (a) multi-center groups and (b) SRUs from CX extensions.

    The text must be enclosed in "|" characters. It is scanned from left
    to right; "Sg:n:" hands control to `parse_sru`, "m:" hands control to
    `parse_mcg`, and every other character is skipped.

    :param text: CX extensions to be parsed.
    :type text: str

    :return: Tuple of lists that hold the MCGs and SRUs.
    :rtype: (list(MCG), list(SRU))

    :raises ValueError: If `text` is shorter than two characters, does
        not start with "|", or if the first "|" after the opening one is
        not the last character of `text`.
    '''

    if len(text) < 2 or not text.startswith('|'):
        raise ValueError('does not start with |')

    mcgs = []  # "multi-center SGroups"
    srus = []  # repeating units

    size = len(text)
    cursor = 1
    while cursor < size:
        if text[cursor] == '|':
            break
        if text.startswith('Sg:n:', cursor):
            cursor = parse_sru(text, cursor + 5, srus)
        elif text.startswith('m:', cursor):
            cursor = parse_mcg(text, cursor + 2, mcgs)
        else:
            cursor += 1

    # the closing "|" must be the very last character
    properly_closed = cursor == size - 1 and text[cursor] == '|'
    if not properly_closed:
        raise ValueError('does not end with |')

    return mcgs, srus
#------------------------------------------------------------------------------#


def _mcgs_and_srus_as_rdcml_json(mol, mcgs, srus):
    '''
    Serializes "multi-center groups" and "repeating units" as a JSON list
    of Sgroup dictionaries following MRV conventions.

    Atom indices are translated into atom IDs by inverting the map
    returned by `common.get_atom_id_map`. A group that refers to an index
    without an ID is skipped, and a warning is logged.

    :param mol: Molecule.
    :type mol: rdkit.Chem.Mol

    :param mcgs: List of "multi-center groups".
    :type mcgs: list(MCG)

    :param srus: List of "repeating units".
    :type srus: list(SRU)

    :return: JSON representation of `mcgs` and `srus`.
    :rtype: str
    '''

    idx2id = {
        idx: atom_id
        for atom_id, idx in common.get_atom_id_map(mol).items()
    }

    def atom_refs(indices):
        # space-separated atom IDs; KeyError for an unknown index
        return ' '.join(idx2id[i] for i in indices)

    sgroups = []

    for mcg in mcgs:
        try:
            center_id = idx2id[mcg.center]
            member_ids = atom_refs(mcg.atoms)
        except KeyError:
            logger.warning('invalid atom indices in a multicenter group')
            continue
        sgroups.append({
            'role': 'MulticenterSgroup',
            'center': center_id,
            'atomRefs': member_ids
        })

    for sru in srus:
        try:
            member_ids = atom_refs(sru.atoms)
        except KeyError:
            logger.warning('invalid atom indices in a repeating unit')
            continue
        sgroups.append({
            'role': 'SruSgroup',
            'title': sru.subscript,
            'connect': sru.superscript,
            'atomRefs': member_ids
        })

    return json.dumps(sgroups)


#------------------------------------------------------------------------------#
def mol_from_cxsmiles(text, parseName=True):
    '''
    Attempts to build a `rdkit.Chem.Mol` from `text`, which is assumed
    to be CX SMILES.

    If parsing fails, a warning is logged and None is returned. Otherwise,
    multi-center groups and repeating units found in the CX SMILES
    "features" (if any) are stored on the molecule as a JSON-valued
    property, and `markush.canonicalize_R_labels` is applied to the
    molecule.

    :param text: CX SMILES string.
    :type text: str

    :param parseName: Parse molecule title?
    :type parseName: bool

    :return: Molecule or None
    :rtype: rdkit.Chem.Mol or NoneType
    '''

    # NOTE(review): block nesting below (the None check inside the `with`,
    # R-label canonicalization at function level) was reconstructed from a
    # whitespace-flattened source -- confirm against the original file.

    parser_params = Chem.SmilesParserParams()
    parser_params.allowCXSMILES = True
    parser_params.parseName = parseName

    try:
        with rdkit_adapter.convert_log_to_exception():
            mol = Chem.MolFromSmiles(text, parser_params)
            if mol is None:
                raise ValueError('unknown error')
    except Exception as e:
        logger.warning(f'WARNING: could not process "{text.strip()}": {e}')
        return None

    # adapt "repeating units" and "position variant bonds"

    if mol.HasProp(CXSMILES_DATA_PROP):
        cxfeatures = mol.GetProp(CXSMILES_DATA_PROP)
        try:
            mcgs, srus = parse_cx_extensions(cxfeatures)
        except Exception:
            logger.warning(f'could not parse CX SMILES features "{cxfeatures}"')
            mcgs, srus = [], []

        # nothing is attached when parsing failed or found no groups
        if mcgs or srus:
            sgroups_json = _mcgs_and_srus_as_rdcml_json(mol, mcgs, srus)
            mol.SetProp(common.CML_PROP_PREFIX + 'sgroups', sgroups_json)

    markush.canonicalize_R_labels(mol)

    return mol
#------------------------------------------------------------------------------#