Source code for schrodinger.application.matsci.smartsutils

"""
Utilities for working with SMARTS patterns

Copyright Schrodinger, LLC. All rights reserved.
"""

import string
import warnings
from collections import defaultdict
from collections import namedtuple

from rdkit import Chem

from schrodinger.application.matsci import msprops
from schrodinger.application.matsci import msutils
from schrodinger.structutils import analyze

SMARTSGroupData = namedtuple('SMARTSGroupData',
                             ['number', 'name', 'pattern', 'indexes'],
                             defaults=[None, None])

GROUP_STRUCT_PROP_BASE = 's_matsci_smarts_group_%s'
GROUP_STRUCT_COUNT = 'i_matsci_smarts_group_matches_%s'

# see MATSCI-9181 - keep these for backwards compatibility
GROUP_NAME_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEX_PROP = 'i_matsci_smarts_group_atom_index'
GROUP_NUMBER_PROP = 'i_matsci_smarts_group_number'

GROUP_NAMES_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEXES_PROP = 's_matsci_smarts_group_atom_index'
GROUP_NUMBERS_PROP = 's_matsci_smarts_group_number'

ATOM_PROP_DELIM = ','

VALID_NAME_PUNCTUATION = '_-()[]'
VALID_NAME_CHARS = {
    x for x in string.ascii_letters + string.digits + VALID_NAME_PUNCTUATION
}
VALID_NAME_DESC_TEXT = ('letters, numbers and the special characters %s' %
                        VALID_NAME_PUNCTUATION)


[docs]def defines_integer_atom_props(st): """ Return True if the given structure has SMARTS-related integer atom properties defined. These properties were originally used when each atom was forced to belong to a single SMARTS group. That condition has been relaxed but these properties are kept for backwards compatibility. :type st: schrodinger.structure.Structure :param st: the structure :rtype: bool :return: True if such properties are defined """ return msutils.has_atom_property(st, GROUP_ATOM_INDEX_PROP) or \ msutils.has_atom_property(st, GROUP_NUMBER_PROP)
[docs]def get_group_names(atom): """ Return a list of group names for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group names """ tokens = atom.property.get(GROUP_NAMES_PROP) if tokens: return tokens.split(ATOM_PROP_DELIM) return []
[docs]def get_group_atom_indices(atom): """ Return a list of group atom indices for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group atom indices """ index = atom.property.get(GROUP_ATOM_INDEX_PROP) if index: return [index] tokens = atom.property.get(GROUP_ATOM_INDEXES_PROP) if tokens: return list(map(int, tokens.split(ATOM_PROP_DELIM))) return []
[docs]def get_group_numbers(atom): """ Return a list of group numbers for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group numbers """ number = atom.property.get(GROUP_NUMBER_PROP) if number: return [number] tokens = atom.property.get(GROUP_NUMBERS_PROP) if tokens: return list(map(int, tokens.split(ATOM_PROP_DELIM))) return []
[docs]def append_property(atom, key, value): """ Append the given property to the atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :type key: str :param key: the property key :type value: str :param value: the property value """ old_value = atom.property.get(key) if old_value: value = f'{old_value}{ATOM_PROP_DELIM}{value}' atom.property[key] = str(value)
[docs]def validate_name(name): """ Make sure name has the correct set of characters :type name: str :param name: The string to check :rtype: bool :return: True if name has no invalid characters, False if any characters are invalid """ return all(x in VALID_NAME_CHARS for x in name)
[docs]class SMARTSGroupError(Exception): """ Class for exceptions related to SMARTS group finding """
[docs]def delete_group_properties(struct): """ Delete all SMARTS group properties (structure and atom) from the structure :type struct: `schrodinger.structure.Structure` :param struct: The structure to delete properties from """ # Delete any structure properties that give group names or count sbase = GROUP_STRUCT_PROP_BASE % "" cbase = GROUP_STRUCT_COUNT % "" tuple_base = (sbase, cbase) for prop in list(struct.property): if prop.startswith(tuple_base): del struct.property[prop] # Delete any atom-based properties for atom in struct.atom: for prop in [ GROUP_NAMES_PROP, GROUP_ATOM_INDEXES_PROP, GROUP_NUMBERS_PROP, GROUP_ATOM_INDEX_PROP, GROUP_NUMBER_PROP ]: if prop in atom.property: del atom.property[prop]
[docs]def find_group_data(struct): """ Find an SMARTS group data on the structure :type struct: `schrodinger.structure.Structure` :param struct: The structure to find groups on :rtype: dict :return: A dictionary. Keys are smarts group numbers, values are `SMARTSGroupData` named tuples for the SMARTS group with that number :raise `SMARTSGroupError`: If something in the data is not consistent """ groups = {} for atom in struct.atom: # validate gindices = get_group_numbers(atom) if not gindices: continue names = get_group_names(atom) if not names: raise SMARTSGroupError('Atom has at least a single ' 'group index but no group names.') if len(gindices) != len(names): raise SMARTSGroupError('Atom belongs to %s groups but ' 'has %s group names.' % (len(gindices), len(names))) for name in names: pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name) if not pattern: raise SMARTSGroupError('There is no pattern specified ' 'for group name %s.' % name) # build groups for gindex, name in zip(gindices, names): if gindex in groups: # New atom for existing group group = groups[gindex] if name != group.name: raise SMARTSGroupError('At least two names are used for ' 'group number %d: %s and %s' % (gindex, name, group.name)) groups[gindex].indexes.append(atom.index) else: # New group pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name) inds = [atom.index] groups[gindex] = SMARTSGroupData(number=gindex, name=name, pattern=pattern, indexes=inds) # Test that all groups with the same name have the same number of atoms numat = {} for group in groups.values(): total = numat.setdefault(group.name, len(group.indexes)) if len(group.indexes) != total: raise SMARTSGroupError('Group %s has an inconsistent number of ' 'atoms. Found groups with this name ' 'containing %d and %d atoms.' % (group.name, len(group.indexes), total)) return groups
[docs]def get_rdkit_atoms(smarts): """ Return a collection of rdkit atoms for the given SMARTS. The return value has the length of a potential match group, for example for 'cc' this length is 2, for '[$([NH]([CH2])[CH2])]C' it is 2, for [n-0X2].[n-0X2] it is 2, etc., even though there might be any number of matches if the pattern was matched. :type smarts: str :param smarts: the SMARTS pattern :raise RuntimeError: if rdkit has a problem with the SMARTS :rtype: rdkit.Chem.rdchem._ROAtomSeq :return: the rdkit atoms """ # recursive and component SMARTS supported amol = Chem.MolFromSmarts(smarts) if amol is None: msg = ('Rdkit fails for the given SMARTS of ' '{smarts}.').format(smarts=smarts) raise RuntimeError(msg) return amol.GetAtoms()
[docs]def is_smarts_bonding_pair(smarts): """ Return True if the given SMARTS would match a bonding pair, False otherwise. :type smarts: str :param smarts: the SMARTS pattern :rtype: bool :return: True if the SMARTS would match a bonding pair, False otherwise """ # the smarts could be a component pattern like [n-0X2].[n-0X2] # which is a non-bonding pair atoms = get_rdkit_atoms(smarts) if len(atoms) != 2: return False bonds = atoms[0].GetBonds() return bool(bonds)
[docs]class SMARTSGroup(object): """ Handles matching and record-keeping for a SMARTS patter """
[docs] def __init__(self, name, pattern, logger=None): """ Create a SMARTSGroup object :type name: str :param name: The name of this SMARTS group :type pattern: str :param pattern: The SMARTS pattern for this group :raise ValueError: If name has invalid characters :raise ValueError: If the SMARTS is invalid """ if not validate_name(name): raise ValueError( 'The SMARTS group name, %s, should only contain %s' % (name, VALID_NAME_DESC_TEXT)) with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=DeprecationWarning, message=r"analyze\.validate_smarts") valid, msg = analyze.validate_smarts(pattern) if not valid: raise ValueError('The SMARTS pattern %s is not valid. Error was: ' '\n%s' % (pattern, msg)) self.name = name self.pattern = pattern self.sprop = GROUP_STRUCT_PROP_BASE % self.name self.cprop = GROUP_STRUCT_COUNT % self.name self.last_number = 0 self.logger = logger
[docs] def nextNumber(self, numbers_used): """ Get the next unused group number :type numbers_used: set :param numbers_used: Each member is a number that has already been used for a group and is unavailable. The number returned by this function is added to the numbers_used set. :rtype: int :return: The lowest available number. This number will have been added to the numbers_used set. """ while True: self.last_number += 1 if self.last_number not in numbers_used: numbers_used.add(self.last_number) return self.last_number
[docs] def prioritizeBackbone(self, matches, backbone_atoms): """ Prioritize matches that are in backbone of the molecule :param matches: List of list containing the smart pattern matches :type matches: list :type backbone_atoms: dict :param backbone_atoms: dictionary with key as molecule number and backbone atoms index ordered in a list :returns: List of list containing the smart pattern matches where the matches in the backbone appears first :rtype: list """ sorted_matches = [] for mol_num in sorted(matches): # get matches and backbone atoms for the molecule mol_matches = matches[mol_num] mol_back_atoms = backbone_atoms.get(mol_num, []) backbone_positions = {y: x for x, y in enumerate(mol_back_atoms)} # List to hold positions for matches where at least one atom is in # in the backbone pos_backbone_matches = [] # List to hold positions for matches where there is no match in the # backbone of the molecule index_sidechain_matches = [] for match in mol_matches: if not match: continue # Loop through atom indexes of the smarts match and store their # postion in the backbone match_bb_positions = [ backbone_positions[x] for x in match if x in backbone_positions ] if match_bb_positions: pos_backbone_matches.append( (min(match_bb_positions), match)) else: # In case none of the match atoms are in backbone, use the # lowest atom index of the match index_sidechain_matches.append((min(match), match)) for value_match in [pos_backbone_matches, index_sidechain_matches]: sorted_matches.extend([y for x, y in sorted(value_match)]) return sorted_matches
def _getUniqueMonomersInMatches(self, struct, matches): """ Split the matches into dictionaries depending on the number of unique monomer constituting the match. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find SMARTS pattern match in :param matches: List of list containing the smart pattern matches :type matches: list :returns: Dictionary where the values are list of matches and the key is the number of unique monomers to which the match belongs to. :rtype: dict """ monomers_idx_matches = defaultdict(list) for match in matches: # Find the monomer number of atom ids in the match monomer_numbers = set() for idx in match: monomer_num = struct.atom[idx].property.get( msprops.MONOMER_NUMBER) if monomer_num is not None: monomer_numbers.add(monomer_num) monomers_idx_matches[len(monomer_numbers)].append(match) return dict(monomers_idx_matches)
[docs] def prioritizeSameMonomers(self, struct, matches): """ Prioritize matches that belong to same (or least) number of unique monomers. :type struct: `schrodinger.structure.Structure` :param struct: The structure to find SMARTS pattern match in :param matches: List of list containing the smart pattern matches :type matches: list :returns: List of list containing the smart pattern matches where the matches that belong to same (or least) number of unique monomers appear first :rtype: list """ monomers_idx_matches = self._getUniqueMonomersInMatches(struct, matches) sorted_matches = [] for num_monomers in sorted(monomers_idx_matches): sorted_matches.extend(monomers_idx_matches[num_monomers]) return sorted_matches
[docs] def orderedMatches(self, struct, backbone_atoms): """ Evaluate the smarts pattern matches, where matches are ordered to follow network sequence. Consider backbone atoms matches first in the sequence and the side chain matches are then ordered according to atom index. :param `schrodinger.structure.Structure` struct: The structure to delete properties from :param dict backbone_atoms: dictionary with key as molecule number and backbone atoms index ordered in a list :return list(list): List of list containing the smart pattern matches """ # Find matches in for the smart pattern with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=DeprecationWarning, message=r"analyze\.evaluate_smarts") matches = analyze.evaluate_smarts_by_molecule(struct, self.pattern, canvas=False, matches_by_mol=True, unique_sets=True) backbone_sorted_matches = self.prioritizeBackbone( matches, backbone_atoms) monomer_prioritized_matches = self.prioritizeSameMonomers( struct, backbone_sorted_matches) return monomer_prioritized_matches
[docs] def getSmartsDictData(self, struct, match, allow_partial_overlap): """ Gets the smarts dictionary data for the passed match :type struct: `schrodinger.structure.Structure` :param struct: The structure to which the SMARTS match belongs to :param match: The list of atom indices that belong to the SMARTS match :type match: list :param allow_partial_overlap: Whether partial overlap is allowed :type allow_partial_overlap: bool :return: Dictionary where the key is a SMARTSGroupData and the value is the list of atom indices that belong to the group :rtype: dict :raises ValueError: If overlapping is not supported due to backwards incompatibility """ smarts_group_data_dict = defaultdict(list) for idx in match: atom = struct.atom[idx] if allow_partial_overlap and ( atom.property.get(GROUP_ATOM_INDEX_PROP) or atom.property.get(GROUP_NUMBER_PROP)): raise ValueError( 'Backwards compatibility is supported but not here when ' 'allowing partial overlaps.') for group_name, group_number in zip(get_group_names(atom), get_group_numbers(atom)): smarts_group_data = SMARTSGroupData(name=group_name, number=group_number) smarts_group_data_dict[smarts_group_data].append(idx) return smarts_group_data_dict
[docs] def isAlreadyMatched(self, match, smarts_group_matches): """ Check if atoms have already been matched to the same SMARTS pattern. This allows creating a group from some atoms already matched to some group and other atoms already matched to some other group but prevents creating a group that is a sub-group of another group :param match: The list of atom indices that belong to the SMARTS match :type match: list :param smarts_group_matches: Dictionary where the key is a unique SMARTS pattern and the value is the list of atom indices that belong to the group :type smarts_group_matches: dict :rtype: bool :return: Whether the match has already been matched to same SMARTS """ for idxs in smarts_group_matches.values(): if set(match) == set(idxs): return True return False
[docs] def onlyDifferByAHydrogen(self, struct, match, smarts_group_matches): """ Check all atoms except a single H atom have already been matched to the same SMARTS group. This prevents matching a terminal methyl 3 times as is the case for a polymer head or tail monomer with a terminating hydrogen :type struct: `schrodinger.structure.Structure` :param struct: The structure to which the SMARTS match belongs to :param match: The list of atom indices that belong to the SMARTS match :type match: list :param smarts_group_matches: Dictionary where the key is a unique SMARTS pattern and the value is the list of atom indices that belong to the group :type smarts_group_matches: dict :rtype: bool :return: Whether all atoms except a single H atom have already been matched """ for idxs in smarts_group_matches.values(): if len(idxs) + 1 == len(match): diff_idxs = list(set(match).difference(idxs)) if (len(diff_idxs) == 1 and struct.atom[diff_idxs[0]].atomic_number == 1): return True return False
[docs] def isOverlapAllowedInMatch(self, struct, match, smarts_group_data_dict): """ Determines if overlap is allowed for the current match. :type struct: `schrodinger.structure.Structure` :param struct: The structure to which the SMARTS match belongs to :param match: The list of atom indices that belong to the SMARTS match :type match: list :param smarts_group_data_dict: Dictionary where the key is a SMARTSGroupData and the value is the list of atom indices that belong to the group :type smarts_group_data_dict: dict :rtype: bool :return: Whether overlapping is allowed for the current match """ # Combine groups with same name to avoid multiple overlapping of same # smarts type smarts_group_matches = defaultdict(list) for smarts_data, overlap_atom_ids in smarts_group_data_dict.items(): smarts_group_matches[smarts_data.name].extend(overlap_atom_ids) for match_dict in [smarts_group_matches, smarts_group_data_dict]: if (self.isAlreadyMatched(match, match_dict) or self.onlyDifferByAHydrogen(struct, match, match_dict)): return False return True
[docs] def addMatchToAtoms(self, struct, numbers_used, match): """ Adds match information to structure atom property :type struct: `schrodinger.structure.Structure` :param struct: The structure to which the SMARTS match belongs to :type numbers_used: set :param numbers_used: Each member is a number that has already been used for a group and is unavailable. The number returned by this function is added to the numbers_used set. :param match: The list of atom indices from the match :type match: list """ number = self.nextNumber(numbers_used) for match_index, atom_index in enumerate(match, 1): atom = struct.atom[atom_index] append_property(atom, GROUP_NAMES_PROP, self.name) append_property(atom, GROUP_ATOM_INDEXES_PROP, match_index) append_property(atom, GROUP_NUMBERS_PROP, number)
[docs] def log(self, msg): """ Log message if logger is present :param msg: The message to log :type msg: str """ if self.logger: self.logger(msg)
[docs] def match(self, struct, numbers_used, backbone_atoms, allow_partial_overlap=False): """ Find all the SMARTS groups matching the SMARTS pattern and mark them with the appropriate properties :param `schrodinger.structure.Structure` struct: The structure to find SMARTS pattern match in :param set numbers_used: A set of all the group numbers that have been used and are unavailable :param dict backbone_atoms: dictionary with key as molecule number and backbone atoms index ordered in a list :param bool allow_partial_overlap: whether atoms can belong to multiple groups """ self.last_number, valid_matches = 0, 0 self.log( f'Matching Group: name = {self.name}, pattern = {self.pattern}') matches = self.orderedMatches(struct, backbone_atoms) for match in matches: try: smarts_group_data_dict = self.getSmartsDictData( struct, match, allow_partial_overlap) except ValueError: continue if allow_partial_overlap and not self.isOverlapAllowedInMatch( struct, match, smarts_group_data_dict): continue elif not allow_partial_overlap and smarts_group_data_dict: # if not allowing partial overlaps then none of the atoms # in the match are allowed to have already been matched continue valid_matches += 1 self.addMatchToAtoms(struct, numbers_used, match) if valid_matches: # last_number will be > 0 if there were any valid matches struct.property[self.sprop] = self.pattern struct.property[self.cprop] = valid_matches msg = '%d matches found' % len(matches) if matches: msg += ', %d were tagged as new particles' % valid_matches self.log(msg)