# Source code for schrodinger.application.scaffold_enumeration.cxsmiles

'''
Helpers for reading CX SMILES, including parsers for the "repeating units"
and "position variant bonds" found in the CX SMILES "features" text.

The parsers are deliberately simple: they are not particularly bright, but
are probably good enough for machine-generated CX SMILES.
'''

import collections
import html
import json
import re

from rdkit import Chem

from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log

from . import common
from . import markush

# Lightweight records produced by the CX SMILES "features" parsers:
# MCG is a multi-center group, SRU is a repeating unit.
MCG = collections.namedtuple('MCG', 'atoms center')
SRU = collections.namedtuple('SRU', 'atoms subscript superscript')

# Per-field documentation (surfaces in help() and generated API docs).
MCG.atoms.__doc__ = "List of atom indices ([int])."
MCG.center.__doc__ = "Central atom index (int)."

SRU.atoms.__doc__ = "List of atom indices ([int])."
SRU.subscript.__doc__ = "SRU's subscript (str)."
SRU.superscript.__doc__ = "SRU's superscript (str)."

# Name of the molecule property from which mol_from_cxsmiles() reads the
# CX SMILES "features" text (presumably populated by the RDKit SMILES
# parser -- confirm against the RDKit version in use).
CXSMILES_DATA_PROP = '_CXSMILES_Data'

# Module-level logger; the functions below use it to report inputs that
# could not be processed (via logger.warning).
logger = log.get_output_logger(__name__)

#------------------------------------------------------------------------------#


def parse_mcg(text, pos, accum):
    '''
    Parses "multi-center SGroup" data from CX SMILES "features".

    <quote>

    The multicenter atom indexes written after "m:" followed by a
    colon character and the indexes of the atoms which form the
    given SGroup separated by ".". The SGroups are separated by commas.

    Example: "m:0:7.6.5.4.3,2:12.11.10.9.8,C:0.0,2.1"

    </quote>

    Scanning stops at the first character that is neither a digit, '.',
    ':' nor ','. A group that runs up to the very end of `text` (no
    terminating character) is still recorded. Groups with fewer than two
    indices (i.e. without member atoms) are skipped.

    :param text: CX SMILES "features" string.
    :type text: str

    :param pos: Index of the character in `text` right after "m:".
    :type pos: int

    :param accum: List to which the "SGroups" are to be appended.
    :type accum: list

    :return: Index of the first unconsumed character in `text`.
    :rtype: int
    '''

    def flush_group(chunk):
        # First index is the center, the rest are the member atoms; the
        # ':' and '.' separators are deliberately not told apart.
        indices = [int(t) for t in re.split(r'[.:]', chunk) if t]
        if len(indices) > 1:
            accum.append(MCG(center=indices[0], atoms=indices[1:]))

    start = pos

    while pos < len(text):
        if text[pos] not in '.:0123456789':
            flush_group(text[start:pos])
            start = pos + 1
            if text[pos] != ',':
                # anything but a comma ends the list of groups; leave
                # `pos` on that character for the caller
                return pos
        pos += 1

    # `text` ended without a terminating character: do not drop the
    # group that was still being scanned
    flush_group(text[start:pos])

    return pos


#------------------------------------------------------------------------------#


def parse_sru(text, pos, accum):
    '''
    Parses "SRU" data from CX SMILES "features".

    <quote>

    Polymer Sgroups
    Each Sgroup exported after "Sg:" in fields separated by a colon. Fields are:

    1. Sgroup type keyword. Valid keywords are:

    +---------+------------+
    | Keyword | Sgroup Type|
    | n       | SRU        |
    | ...     | ...        |
    +----------------------+

    2. Atom indexes separated with commas.

    3. Subscript of the Sgroup. If the subscript equals the keyword of
       the Sgroup this field can be empty. Escaped field.

    4. Superscript of the Sgroup. In the superscript only connectivity
       and flip information is allowed. This field can be empty. Escaped
       field.

    5. Head crossing bond indexes. The indexes of bonds that share a
       common bracket in case of ladder-type polymers. This field can be
       empty.

    6. Tail crossing bond indexes. The indexes of bonds that share a
       common bracket in case of ladder-type polymers. This field can be
       empty.

    7. If the c export option is present then bracket orientation,
       bracket type followed by the coordinates (4 pair, separated with
       commas). Bracket orientation can be s or d (single or double),
       bracket type can be b,c,r,s for braces, chevrons, round and square,
       respectively. The brackets are written between parentheses and
       separated with semicolons.

    A colon is needed after the last non-empty field.

    If one needs to retain not only the chemically relevant information,
    but the whole structure (as drawn), then the c export option should
    be used.

    Examples::

        CCCC |Sg:gen:0,1,2:|
        CCCC |Sg:n:0,1,2:3-6:eu|
        *CC(*)C(*)N* |Sg:n:6,1,2,4::hh&#44;f:6,0,:4,2,|

    </quote>

    In addition:

    <quote>

    Escaping

    In some places special characters are escaped to '&#code' where code
    is the ASCII code of the special character.

    Not escaped characters in fields of Sgroups and DataSgroups:
    'a'-'z', 'A'-'Z', '0'-'9' and '><\"!@#$%()[]./\\?-+*^_~=' and the
    space character.

    Not escaped characters in atom property keys and values: 'a'-'z',
    'A'-'Z', '0'-'9' and '><\"!@#$%()[]./\\?-+*^_~=' and the space
    character.

    Not escaped characters in atom labels and atom values: 'a'-'z',
    'A'-'Z', '0'-'9' and '><\"!@#%()[]./\\?-+*^_~=,:' and the space
    character.

    </quote>

    This subroutine recognizes only:
        atoms (2), subscript (3), and superscript (4).

    Scanning never goes past an unescaped '|' (it closes the whole
    "features" text). If the '|' or the end of `text` is reached after
    the atoms field was closed by its colon, the fields that are missing
    are taken to be empty, so that the short form "Sg:n:0,1,2:" is
    accepted. Nothing is appended when the atoms field is empty or was
    not terminated by a colon.

    :param text: CX SMILES "features" string.
    :type text: str

    :param pos: Index of the character in `text` right after "Sg:n:".
    :type pos: int

    :param accum: List to which the "SGroups" are to be appended.
    :type accum: list

    :return: Index of the first unconsumed character in `text` (the
             character that terminated the scan, or `len(text)`).
    :rtype: int

    :raises ValueError: If the atoms field holds something that is not
                        an integer.
    '''

    start = pos

    fields = []

    while pos < len(text):
        char = text[pos]
        if char == '|':
            # '|' is escaped inside of the fields, hence a literal one
            # always marks the end of the "features" text
            break
        # despite the spec above, the third field of a legit SRU may
        # also terminate at a comma (next feature follows)
        if char == ':' or (len(fields) == 2 and char == ','):
            fields.append(text[start:pos])
            start = pos + 1
            if len(fields) == 3:
                # leave `pos` on the terminator for the caller
                break
        pos += 1

    if fields and len(fields) < 3:
        # stopped at '|' or at the end of `text`: close the field being
        # scanned and treat the remaining ones as empty
        fields.append(text[start:pos])
        fields.extend([''] * (3 - len(fields)))

    if len(fields) == 3:
        atoms = [int(t) for t in fields[0].split(',') if t]
        if atoms:
            accum.append(
                SRU(atoms=atoms,
                    subscript=html.unescape(fields[1]),
                    superscript=html.unescape(fields[2])))

    return pos


#------------------------------------------------------------------------------#


def parse_cx_extensions(text):
    '''
    Parses: (a) multi-center groups and (b) SRUs.

    The text must be enclosed in a pair of '|' characters. It is scanned
    left to right; "Sg:n:" hands over to `parse_sru`, "m:" hands over to
    `parse_mcg`, everything else is skipped one character at a time.

    :param text: CX extensions to be parsed.
    :type text: str

    :return: Tuple of lists that hold the MCGs and SRUs.
    :rtype: (list(MCG), list(SRU))

    :raises ValueError: If `text` does not start with '|' or if the
                        closing '|' is not its very last character.
    '''

    if not text.startswith('|'):
        raise ValueError('does not start with |')

    mcgs = []  #  "multi-center SGroups"
    srus = []  #  "structural repeating units"

    end = len(text)
    pos = 1

    while pos < end and text[pos] != '|':
        if text.startswith('Sg:n:', pos):
            pos = parse_sru(text, pos + 5, srus)
        elif text.startswith('m:', pos):
            pos = parse_mcg(text, pos + 2, mcgs)
        else:
            pos += 1

    # the scan must have stopped exactly on the last character, and that
    # character must be the closing '|' (this also rejects a lone "|")
    if pos + 1 != end or text[pos] != '|':
        raise ValueError('does not end with |')

    return mcgs, srus


#------------------------------------------------------------------------------#


def _mcgs_and_srus_as_rdcml_json(mol, mcgs, srus):
    '''
    Serializes "multi-center groups" and "repeating units" as a JSON
    list of S-group records following MRV conventions.

    Atom indices are translated via `common.get_atom_id_map()`; a group
    that refers to an index missing from that map is skipped with a
    warning.

    :param mol: Molecule.
    :type mol: rdkit.Chem.Mol

    :param mcgs: List of "multi-center groups".
    :type mcgs: list(MCG)

    :param srus: List of "repeating units".
    :type srus: list(SRU)

    :return: JSON representation of `mcgs` and `srus`.
    :rtype: str
    '''

    # invert the map returned by common.get_atom_id_map() (presumably
    # atom ID -> atom index) so that it can be looked up by index
    atom_id_of = {
        index: atom_id
        for atom_id, index in common.get_atom_id_map(mol).items()
    }

    def atom_refs(indices):
        # space-separated atom IDs; KeyError for an unknown index
        return ' '.join(atom_id_of[i] for i in indices)

    records = []

    for group in mcgs:
        try:
            record = {
                'role': 'MulticenterSgroup',
                'center': atom_id_of[group.center],
                'atomRefs': atom_refs(group.atoms)
            }
        except KeyError:
            logger.warning('invalid atom indices in a multicenter group')
        else:
            records.append(record)

    for unit in srus:
        try:
            refs = atom_refs(unit.atoms)
        except KeyError:
            logger.warning('invalid atom indices in a repeating unit')
        else:
            records.append({
                'role': 'SruSgroup',
                'title': unit.subscript,
                'connect': unit.superscript,
                'atomRefs': refs
            })

    return json.dumps(records)


#------------------------------------------------------------------------------#


def mol_from_cxsmiles(text, parseName=True):
    '''
    Strives to instantiate `rdkit.Chem.Mol` from `text` assuming that
    the latter is CX SMILES.

    If the molecule carries the `CXSMILES_DATA_PROP` property, its value
    is parsed for multi-center groups and repeating units; when any are
    found, their JSON representation is stored in the molecule property
    named `common.CML_PROP_PREFIX + 'sgroups'`. Failure to parse that
    property is logged and otherwise ignored. Finally,
    `markush.canonicalize_R_labels()` is applied to the molecule.

    :param text: CX SMILES string.
    :type text: str

    :param parseName: Parse molecule title?
    :type parseName: bool

    :return: Molecule or None (if `text` could not be processed; a
             warning is logged in that case).
    :rtype: rdkit.Chem.Mol or NoneType
    '''

    params = Chem.SmilesParserParams()
    params.allowCXSMILES = True
    params.parseName = parseName

    try:
        # presumably turns RDKit log messages into exceptions, so that
        # the reason of a failure ends up in the warning below
        with rdkit_adapter.convert_log_to_exception():
            mol = Chem.MolFromSmiles(text, params)
        if mol is None:
            raise ValueError('unknown error')

    except Exception as e:
        logger.warning(f'WARNING: could not process "{text.strip()}": {e}')
        return None

    # adapt "repeating units" and "position variant bonds"

    if mol.HasProp(CXSMILES_DATA_PROP):
        cxfeatures = mol.GetProp(CXSMILES_DATA_PROP)
        try:
            mcgs, srus = parse_cx_extensions(cxfeatures)
        except Exception:
            # best effort: the molecule is still returned, just without
            # the S-groups
            logger.warning(f'could not parse CX SMILES features "{cxfeatures}"')
        else:
            if mcgs or srus:
                mol.SetProp(common.CML_PROP_PREFIX + 'sgroups',
                            _mcgs_and_srus_as_rdcml_json(mol, mcgs, srus))

    markush.canonicalize_R_labels(mol)

    return mol


#------------------------------------------------------------------------------#