Source code for schrodinger.structutils.filter

"""
Functions and classes for filtering structure files based on properties or
SMARTS patterns. Supports filter files in the formats used by propfilter and
canvasSearch, respectively. The filter classes support both Structure and Mol
objects.

Simple example::

    prop_filter = PropFilter(filename='filters.txt')
    reader = StructureReader('structs.maegz')
    for st in prop_filter.filter(reader):
        # st matches; do something with it

    smarts_filter = SmartsFilter(filename='filters.cflt')
    for st in smarts_filter.filter(reader):
        # st matches; do something with it

Copyright Schrodinger, LLC. All rights reserved.

"""

import csv
import functools
import operator
import re

import pyparsing as pp
from rdkit import Chem

from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log

# Module-level logger; parse_filter_expression uses it for debug output.
# NOTE(review): the logger name refers to "schrodinger.analysis.reaction",
# not this module (schrodinger.structutils.filter) -- looks copy-pasted;
# confirm with the logging configuration before renaming.
logger = log.get_output_logger("schrodinger.analysis.reaction")


class SingleFilter(object):
    """
    Abstract interface for one filter condition.

    Subclasses supply the actual test by overriding checkStructure.
    """

    def checkStructure(self, st_or_mol):
        """
        Tell whether st_or_mol satisfies this filter.

        The base class provides no test of its own and always raises.

        :type st_or_mol: Structure or Mol

        :rtype: bool

        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError()
class Filter(object):
    """
    Base class for filtering structures. The specific filters to use are
    determined by the SingleFilter objects passed to .append(), or by the
    file read with readFile().
    """

    # Optional converter applied to every input before the filters see it,
    # and the class that converter produces. Subclasses set both together;
    # when _mol_adapter is falsy, inputs are passed through unchanged.
    _mol_adapter = None
    _mol_class = None

    # Marker meaning "nothing cached yet". A dedicated object is used instead
    # of None so that a None input is never mistaken for a cache hit.
    _NOT_CACHED = object()

    # One-entry conversion cache used by _adaptStructure.
    _last_st = _NOT_CACHED
    _last_mol = None

    def __init__(self, filters=None, filename=None, **kwds):
        """
        Create a filter object, optionally with a source for the filter
        conditions.

        :param kwds: additional keywords to pass to readFile or readFileName

        :type filters: iterable of str

        :type filename: str
        """
        self.filters = []
        # Reset the conversion cache before anything else can run.
        self._last_st = self._NOT_CACHED
        self._last_mol = None
        if filters:
            self.readFile(filters, **kwds)
        if filename:
            self.readFileName(filename, **kwds)

    def readFileName(self, filename, **kwds):
        """
        Add filter conditions given a filename.

        :param kwds: additional keywords to pass to readFile

        :type filename: str
        """
        with open(filename) as fh:
            self.readFile(fh, **kwds)

    def readFile(self, fh, **kwds):
        """
        Add filter conditions given a file-like object. Not implemented in
        the base class.

        :param kwds: format-specific options; accepted here so the signature
            matches what __init__ and readFileName forward

        :type fh: iterable of str

        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError

    def append(self, filter):
        """
        Add a single filter to this Filter object.

        :type filter: SingleFilter
        """
        self.filters.append(filter)

    def checkStructure(self, st_or_mol, max_violations=0):
        """
        Return True if st_or_mol passes the filters; False otherwise.

        :type st_or_mol: Structure or Mol

        :param max_violations: number of individual filters that may fail
            while the structure is still accepted
        :type max_violations: int

        :rtype: bool
        """
        if not self.filters:
            # Nothing to check, so do not pay for (or risk) a conversion.
            return True

        # The conversion does not depend on the individual filter: do it once.
        mol = self._adaptStructure(st_or_mol)
        violations = 0
        for single_filter in self.filters:
            if not single_filter.checkStructure(mol):
                violations += 1
                if violations > max_violations:
                    return False
        return True

    def filter(self, structures, **kwds):
        """
        A generator that returns only the structures from 'structures' that
        pass the filter conditions.

        :param kwds: additional keywords to pass to checkStructure

        :type structures: iterable of Structure or Mol
        """
        return (st for st in structures if self.checkStructure(st, **kwds))

    def _adaptStructure(self, st_or_mol):
        """
        Convert st_or_mol to the class specified by cls._mol_class by calling
        the cls._mol_adapter function. Does nothing if the object is already
        an instance of _mol_class or there is no _mol_adapter. This function
        is called by checkStructure before doing the actual checking of the
        filters.

        The most recent conversion is cached. The cache keeps a reference to
        the input object itself and compares by identity: comparing id()
        values alone is unsafe because an id can be reused by a new object
        once the previous one has been garbage collected. The cache does not
        notice in-place modification of the cached input.

        :type st_or_mol: Mol or Structure

        :rtype: cls._mol_class
        """
        adapter = self._mol_adapter
        if not adapter or isinstance(st_or_mol, self._mol_class):
            return st_or_mol

        if st_or_mol is not self._last_st:
            # Convert first so a failing adapter leaves the cache untouched.
            mol = adapter(st_or_mol)
            self._last_st = st_or_mol
            self._last_mol = mol
        return self._last_mol
class SinglePropFilter(SingleFilter):
    """
    Check if a structure satisfies an expression testing a single property.
    The expression uses the syntax supported by
    $SCHRODINGER/utilities/propfilter. For example,
    "r_i_glide_gscore < -5 > -6".

    This class and associated functions support both Structure objects and
    RDKit Mol objects.
    """

    def __init__(self, expr):
        """
        Parse a single-property filter expression.

        :param expr: expression such as "r_i_glide_gscore < -5 > -6"
        :type expr: str
        """
        self.instructions = parse_filter_expression(expr)
        # The first instruction is the property name; the rest is the program.
        self.propname = self.instructions.pop(0)

    @staticmethod
    def _coerce_operand(prop_val, text):
        """
        Convert the literal from the filter expression to a value that can
        be compared with prop_val.

        Plain type(prop_val)(text) is wrong for two property types:
        bool("0") is True because any non-empty string is truthy, and
        int("1.5") raises although comparing an integer property with a
        non-integer bound is meaningful.

        :param prop_val: the property value read from the structure

        :param text: the literal as it appears in the filter expression
        :type text: str

        :return: the literal converted for comparison with prop_val

        :raises ValueError: if a numeric property is compared with a literal
            that is not a number
        """
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(prop_val, bool):
            lowered = text.strip().lower()
            if lowered in ('true', 'false'):
                return lowered == 'true'
            try:
                return bool(float(text))
            except ValueError:
                # Unrecognized literal: keep the historical truthiness rule
                # rather than introducing a new failure.
                return bool(text)
        if isinstance(prop_val, int):
            try:
                return int(text)
            except ValueError:
                return float(text)
        return type(prop_val)(text)

    def checkStructure(self, st_or_mol):
        """
        Return True if st_or_mol passes the filters; False otherwise.

        A structure lacking the property fails. An expression consisting of
        only a property name passes whenever the property exists.

        :type st_or_mol: Structure or Mol

        :rtype: bool
        """
        if not _has_prop(st_or_mol, self.propname):
            return False
        if not self.instructions:
            return True

        # Run the instructions returned by parse_filter_expression
        prop_val = _get_prop(st_or_mol, self.propname)
        result_stack = []
        for inst in self.instructions:
            op = inst[0]
            if len(inst) == 2:
                # (operator, literal): compare the property with the literal.
                left = prop_val
                right = self._coerce_operand(prop_val, inst[1])
            else:
                # (operator,): combine the two most recent results.
                right = result_stack.pop()
                left = result_stack.pop()
            result_stack.append(OPERATORS[op](left, right))
        return result_stack.pop()
class PropFilter(Filter):
    """
    Check if a structure satisfies a given list of conditions. Each condition
    is expressed using the syntax supported by
    $SCHRODINGER/utilities/propfilter. For example,
    "r_i_glide_gscore < -5 > -6".

    This class and associated functions support both Structure objects and
    RDKit Mol objects.
    """

    def readFile(self, fh):
        """
        Add the filter conditions given a file-like object.

        Blank lines and comment lines ("#" alone, or "# " followed by text)
        are skipped. A line ending in a backslash is joined with the next
        non-skipped line to form one expression.

        :type fh: iterable of str
        """
        logical_line = ''
        for line in fh:
            line = line.strip()
            # strip() turns a bare "# " into "#", so test for that explicitly;
            # otherwise an empty comment line would be parsed as a filter.
            if not line or line == '#' or line.startswith('# '):
                continue  # comment or blank line
            logical_line += line
            if line.endswith('\\'):
                logical_line = logical_line[:-1]  # incomplete line
                continue
            self.append(SinglePropFilter(logical_line))
            logical_line = ''

        # The input ended on a continuation line: use what was collected
        # instead of silently dropping a filter condition.
        if logical_line.strip():
            self.append(SinglePropFilter(logical_line))

    def getPropertyNames(self):
        """
        Return the set of properties used by all the filters in this object.

        :rtype: set of str
        """
        return {f.propname for f in self.filters}
class SingleSmartsFilter(SingleFilter):
    """
    Accept a molecule when the number of matches of one SMARTS pattern falls
    within an inclusive [min_matches, max_matches] range.
    """

    def __init__(self, smarts, name, min_matches, max_matches):
        """
        :param smarts: SMARTS pattern to search for
        :type smarts: str

        :param name: label stored with the filter

        :param min_matches: smallest accepted number of matches

        :param max_matches: largest accepted number of matches

        :raises ValueError: if the SMARTS cannot be parsed
        """
        self.smarts = smarts
        self.name = name
        self.min_matches = min_matches
        self.max_matches = max_matches
        self.patt = Chem.MolFromSmarts(smarts)
        # MolFromSmarts signals a parse failure by returning None.
        if self.patt is None:
            raise ValueError('Invalid SMARTS: %s' % smarts)

    def checkStructure(self, mol):
        """
        Return True if mol passes the filter; False otherwise.

        :type mol: Mol

        :rtype: bool
        """
        n_hits = len(mol.GetSubstructMatches(self.patt))
        return self.min_matches <= n_hits <= self.max_matches
class SmartsFilter(Filter):
    """
    Check if a structure satisfies a given list of SMARTS filters. Supports
    reading canvasSearch rule files.
    """

    # Wrapped in staticmethod so that looking the adapter up through an
    # instance always yields the plain callable. Relying on functools.partial
    # not being bound as a method triggers a FutureWarning on recent Python
    # versions.
    _mol_adapter = staticmethod(
        functools.partial(rdkit_adapter.to_rdkit,
                          include_properties=False,
                          include_coordinates=False))
    _mol_class = Chem.Mol

    def readFile(self, fh, delimiter='\t'):
        """
        Add SMARTS filters given a file-like object with delimited rows.

        The columns of each row are, in order: SMARTS, minimum number of
        matches, maximum number of matches, and name. Only the SMARTS is
        required; missing counts default to 0 and a missing name to ''.
        Empty rows are skipped.

        :type fh: iterable of str

        :param delimiter: column separator
        :type delimiter: str

        :raises ValueError: if a count is not an integer or the SMARTS is
            invalid; the message is prefixed with the row number
        """
        reader = csv.reader(fh, dialect='excel-tab', delimiter=delimiter)
        for linenum, row in enumerate(reader, 1):
            if not row:
                continue
            ncols = len(row)
            smarts = row[0]
            try:
                min_matches = int(row[1]) if ncols > 1 else 0
                max_matches = int(row[2]) if ncols > 2 else 0
                name = row[3] if ncols > 3 else ''
                sf = SingleSmartsFilter(smarts, name, min_matches,
                                        max_matches)
            except ValueError as err:
                # May be due to bad int or bad SMARTS.
                raise ValueError("%d: %s" % (linenum, err))
            self.append(sf)
def _regex_text(value):
    """
    Return value unchanged if it is a string, otherwise its str() form, so
    the regex operators also work on numeric property values.
    """
    return value if isinstance(value, str) else str(value)


# Maps each operator token to the callable that evaluates it. Comparison and
# regex operators take (property value, literal); AND/OR combine two results.
OPERATORS = {
    '>': operator.gt,
    '>=': operator.ge,
    '<': operator.lt,
    '<=': operator.le,
    '==': operator.eq,
    '!=': operator.ne,
    # Case-insensitive regular expression search / negated search.
    '~': lambda value, pattern: bool(
        re.search(_regex_text(pattern), _regex_text(value), re.I)),
    '!~': lambda value, pattern: not re.search(
        _regex_text(pattern), _regex_text(value), re.I),
    'AND': operator.and_,
    'OR': operator.or_,
}

# Parser state variables. The parse actions below communicate through these
# module-level globals, so parsing is not reentrant.
_last_op = None  # most recent comparison operator token seen
_instructions = []  # instruction list built up by the parse actions
_pp_parser = None  # cached parser object


def _op_action(t):
    """Remember the operator so a following bare value can reuse it."""
    global _last_op
    _last_op = t[0]


def _and_action(t):
    """Emit the instruction that ANDs the two most recent results."""
    _instructions.append(("AND",))


def _or_action(t):
    """Emit the instruction that ORs the two most recent results."""
    _instructions.append(("OR",))


def _push_first(t):
    """Emit the first token (the property name) as an instruction."""
    _instructions.append(t[0])


def _term_action(t):
    """
    Emit an (operator, value) instruction for one comparison term. A term
    given without an operator inherits the last operator that was parsed.
    """
    if len(t) == 1:
        t.insert(0, _last_op)
    op, val = t
    _instructions.append((op, val))
    return t


def _get_filter_parser():
    """
    Return a parser for propfilter expressions. The parser is built on first
    use and cached in a module-level variable.

    :rtype: pyparsing.ParserElement
    """
    global _pp_parser
    if _pp_parser is None:
        point = pp.Literal('.')
        e = pp.CaselessLiteral('E')
        plusorminus = pp.Literal('+') | pp.Literal('-')
        number = pp.Word(pp.nums)
        integer = pp.Combine(pp.Optional(plusorminus) + number)
        floatnumber = pp.Combine(integer +
                                 pp.Optional(point + pp.Optional(number)) +
                                 pp.Optional(e + integer))
        # Copy before attaching the parse action: setParseAction modifies the
        # element in place, and pp.quotedString is shared by every user of
        # pyparsing in the process.
        string = (pp.quotedString.copy().setParseAction(pp.removeQuotes) |
                  pp.Word(pp.printables))
        # Two-character operators are listed before their one-character
        # prefixes because alternatives are tried in order.
        op = (pp.Literal('<=') | pp.Literal('<') | pp.Literal('>=') |
              pp.Literal('>') | pp.Literal('==') | pp.Literal('!=') |
              pp.Literal('~') | pp.Literal('!~')).setParseAction(_op_action)
        and_op = pp.Literal('AND')
        or_op = pp.Literal('OR')
        property_name = pp.Word(pp.printables).setParseAction(_push_first)
        # A bare AND/OR keyword must not be consumed as a string value.
        value = floatnumber | pp.NotAny(or_op | and_op) + string
        term = (pp.Optional(op) + value).setParseAction(_term_action)
        # The first term must carry an operator; later ones may inherit it.
        first_term = (op + value).setParseAction(_term_action)
        # AND is optional: adjacent terms are implicitly ANDed.
        and_term = (pp.Optional(and_op) + term).setParseAction(_and_action)
        and_expr = term + pp.ZeroOrMore(and_term)
        first_and_expr = first_term + pp.ZeroOrMore(and_term)
        orexpr = first_and_expr + pp.ZeroOrMore(
            (or_op + and_expr).setParseAction(_or_action))
        _pp_parser = property_name + pp.Optional(orexpr) + pp.StringEnd()
    return _pp_parser
def _prop_basename(propname):
    """
    Return propname without a leading <type>_rdkit_ prefix, if it has one.
    """
    return re.compile(r'^._rdkit_').sub('', propname)


def _has_prop(st_or_mol, propname):
    """
    Tell whether a structure carries a property.

    Objects with a `property` mapping are checked through it; for anything
    else the HasProp method is used, trying both the full name and the name
    without its <type>_rdkit_ prefix.

    :type st_or_mol: Structure or Mol

    :type propname: str

    :rtype: bool
    """
    try:
        present = propname in st_or_mol.property
    except AttributeError:
        present = (st_or_mol.HasProp(propname) or
                   st_or_mol.HasProp(_prop_basename(propname)))
    return present


def _get_prop(st_or_mol, propname):
    """
    Fetch a property value from either a Structure or a Mol.

    For objects without a `property` mapping the value comes from GetProp
    (full name first, then the name without its <type>_rdkit_ prefix); it is
    returned as a float when it parses as one and unchanged otherwise.

    :type st_or_mol: Structure or Mol

    :type propname: str

    :rtype: float or str or int or bool
    """
    try:
        return st_or_mol.property[propname]
    except AttributeError:
        # No `property` mapping: use the GetProp interface instead.
        try:
            raw = st_or_mol.GetProp(propname)
        except KeyError:
            raw = st_or_mol.GetProp(_prop_basename(propname))
        try:
            number = float(raw)
        except ValueError:
            return raw
        else:
            return number
def parse_filter_expression(s, verbose=False):
    """
    Compile a filter expression into a list of instructions for a
    stack-based machine.

    The first element of the list is the property name. Every following
    element is a tuple: either (operator, value), comparing the property with
    the value, or (operator,), applied to two values popped from the result
    stack.

    For example, "r_i_glide_gscore < -5 > -6" produces
    ['r_i_glide_gscore', ('<', '-5'), ('>', '-6'), ('AND',)].

    :param verbose: accepted but not used by this function

    :type s: str

    :rtype: list
    """
    # The parse actions append to the shared module-level list; empty it in
    # place so they keep writing to the same object.
    _instructions[:] = []
    parser = _get_filter_parser()
    tokens = parser.parseString(s)
    # Hand back a copy so the next parse cannot alter the caller's list.
    program = list(_instructions)
    logger.debug("Tokens: %s", tokens)
    logger.debug("Instructions: %s", program)
    return program