"""
Functions and classes for filtering structure files based on properties or
SMARTS patterns. Supports filter files in the formats used by propfilter and
canvasSearch, respectively. The filter classes support both Structure and Mol
objects.
Simple example::
prop_filter = PropFilter(filename='filters.txt')
reader = StructureReader('structs.maegz')
for st in prop_filter.filter(reader):
# st matches; do something with it
smarts_filter = SmartsFilter(filename='filters.cflt')
for st in smarts_filter.filter(reader):
# st matches; do something with it
Copyright Schrodinger, LLC. All rights reserved.
"""
import csv
import functools
import operator
import re
import pyparsing as pp
from rdkit import Chem
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import log
logger = log.get_output_logger("schrodinger.analysis.reaction")
class SingleFilter(object):
    """
    Abstract base for one individual filter condition.

    Subclasses are expected to override `checkStructure`.
    """

    def checkStructure(self, st_or_mol):
        """
        Decide whether st_or_mol satisfies this filter.

        The base class provides no implementation and always raises.

        :type st_or_mol: Structure or Mol
        :rtype: bool
        :raises NotImplementedError: always, in the base class
        """
        raise NotImplementedError()
class Filter(object):
    """
    Base class for filtering structures. The specific filters to use are
    determined by the SingleFilter objects passed to .append(), or by
    the file read with readFile().

    Subclasses that need their input converted before checking set
    `_mol_adapter` (a callable taking the input object) and `_mol_class`
    (the class the filters operate on).
    """

    # Optional conversion hook; None means "check the object as given".
    _mol_adapter = None
    _mol_class = None

    # One-entry conversion cache. Class-level defaults keep
    # _adaptStructure usable even if a subclass __init__ never runs ours.
    _last_st = None
    _last_mol = None

    def __init__(self, filters=None, filename=None, **kwds):
        """
        Create a filter object, optionally with a source for the filter
        conditions.

        :param filters: filter definitions, passed to readFile
        :type filters: iterable of str
        :param filename: name of a file of filter definitions, passed to
            readFileName
        :type filename: str
        :param kwds: additional keywords to pass to readFile or readFileName
        """
        self.filters = []
        # Set up the cache before reading anything, so the object is fully
        # initialized by the time readFile runs.
        self._last_st = None
        self._last_mol = None
        if filters:
            self.readFile(filters, **kwds)
        if filename:
            self.readFileName(filename, **kwds)

    def readFileName(self, filename, **kwds):
        """
        Add filter conditions given a filename.

        :param kwds: additional keywords to pass to readFile
        :type filename: str
        """
        with open(filename) as fh:
            self.readFile(fh, **kwds)

    def readFile(self, fh, **kwds):
        """
        Add filter conditions given a file-like object. Not implemented in the
        base class.

        :type fh: iterable of str
        :param kwds: accepted so that keywords forwarded by __init__ and
            readFileName reach this method; unused here
        :raises NotImplementedError: always, in the base class
        """
        raise NotImplementedError

    def append(self, filter):
        """
        Add a single filter to this Filter object.

        :param filter: the filter to add; it must provide checkStructure
        :type filter: SingleFilter
        """
        self.filters.append(filter)

    def checkStructure(self, st_or_mol, max_violations=0):
        """
        Return True if st_or_mol passes the filters; False otherwise.

        :type st_or_mol: Structure or Mol
        :param max_violations: number of individual filters that may fail
            while the structure is still considered to pass
        :type max_violations: int
        :rtype: bool
        """
        if not self.filters:
            return True
        # The converted object is the same for every filter, so convert once.
        mol = self._adaptStructure(st_or_mol)
        violations = 0
        for single_filter in self.filters:
            if not single_filter.checkStructure(mol):
                violations += 1
                if violations > max_violations:
                    return False
        return True

    def filter(self, structures, **kwds):
        """
        A generator that returns only the structures from 'structures' that
        pass the filter conditions.

        :type structures: iterable of Structure or Mol
        :param kwds: additional keywords to pass to checkStructure
        """
        return (st for st in structures if self.checkStructure(st, **kwds))

    def _adaptStructure(self, st_or_mol):
        """
        Convert st_or_mol to the class specified by cls._mol_class by calling
        the cls._mol_adapter function. Does nothing if the object is already
        an instance of _mol_class or there is no _mol_adapter.

        This function is called by checkStructure before doing the actual
        checking of the filters.

        The most recent conversion is cached. The cache keeps a reference to
        the input object and compares by identity; comparing id() values
        alone is unsafe because an id can be reused once the earlier object
        has been freed. An object modified in place between calls still gets
        the cached conversion.

        :type st_or_mol: Mol or Structure
        :rtype: cls._mol_class
        """
        if not self._mol_adapter or isinstance(st_or_mol, self._mol_class):
            return st_or_mol
        if st_or_mol is not None and st_or_mol is self._last_st:
            return self._last_mol
        mol = self._mol_adapter(st_or_mol)
        self._last_st = st_or_mol
        self._last_mol = mol
        return mol
class SinglePropFilter(SingleFilter):
    """
    Test one property of a structure against an expression written in the
    syntax supported by $SCHRODINGER/utilities/propfilter, for example
    "r_i_glide_gscore < -5 > -6".

    Works with both Structure objects and RDKit Mol objects.
    """

    def __init__(self, expr):
        """
        Parse expr into a property name and a list of instructions.

        :param expr: the filter expression
        :type expr: str
        """
        parsed = parse_filter_expression(expr)
        # The parser puts the property name first; the rest is the program.
        self.propname = parsed.pop(0)
        self.instructions = parsed

    def checkStructure(self, st_or_mol):
        """
        Return True if st_or_mol passes the filter; False otherwise.

        A structure lacking the property fails. An expression that is only a
        property name passes whenever the property is present.

        :type st_or_mol: Structure or Mol
        :rtype: bool
        """
        if not _has_prop(st_or_mol, self.propname):
            return False
        if not self.instructions:
            return True

        # Evaluate the instructions on a small stack machine.
        value = _get_prop(st_or_mol, self.propname)
        stack = []
        for instruction in self.instructions:
            opname = instruction[0]
            if len(instruction) == 2:
                # Comparison: the reference value is coerced to the type of
                # the property value.
                # NOTE(review): for a bool property this is bool(<str>),
                # which is True for any non-empty string - confirm intended.
                lhs = value
                rhs = type(value)(instruction[1])
            else:
                # Logical operator: combine the two most recent results.
                rhs = stack.pop()
                lhs = stack.pop()
            stack.append(OPERATORS[opname](lhs, rhs))
        return stack.pop()
class PropFilter(Filter):
    """
    Check if a structure satisfies a given list of conditions. Each condition is
    expressed using the syntax supported by $SCHRODINGER/utilities/propfilter.
    For example, "r_i_glide_gscore < -5 > -6".

    This class and associated functions support both Structure objects and
    RDKit Mol objects.
    """

    def readFile(self, fh):
        """
        Add the filter conditions given a file-like object.

        Blank lines and lines starting with '#' are skipped. A line ending
        in a backslash is continued on the next line. Every resulting
        logical line becomes one SinglePropFilter.

        :type fh: iterable of str
        """
        pending = ''  # text gathered so far from continuation lines
        for raw_line in fh:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue  # comment or blank line
            if line.endswith('\\'):
                pending += line[:-1]  # incomplete line; drop the backslash
                continue
            self.append(SinglePropFilter(pending + line))
            pending = ''
        # A continuation on the last line of the file has no following line
        # to complete it; keep what was gathered rather than dropping it.
        if pending.strip():
            self.append(SinglePropFilter(pending))

    def getPropertyNames(self):
        """
        Return the set of properties used by all the filters in this object.

        :rtype: set of str
        """
        return {f.propname for f in self.filters}
class SingleSmartsFilter(SingleFilter):
    """
    Require that a SMARTS pattern matches a structure a number of times lying
    within a given inclusive range.
    """

    def __init__(self, smarts, name, min_matches, max_matches):
        """
        :param smarts: the SMARTS pattern
        :type smarts: str
        :param name: a label for this filter
        :type name: str
        :param min_matches: smallest acceptable number of matches
        :type min_matches: int
        :param max_matches: largest acceptable number of matches
        :type max_matches: int
        :raises ValueError: if the SMARTS cannot be parsed
        """
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is None:
            raise ValueError('Invalid SMARTS: %s' % smarts)
        self.smarts = smarts
        self.patt = pattern
        self.name = name
        self.min_matches = min_matches
        self.max_matches = max_matches

    def checkStructure(self, mol):
        """
        Return True if the number of matches of the pattern in mol lies
        between min_matches and max_matches, inclusive; False otherwise.

        :type mol: Mol
        :rtype: bool
        """
        num_matches = len(mol.GetSubstructMatches(self.patt))
        return self.min_matches <= num_matches <= self.max_matches
class SmartsFilter(Filter):
    """
    Check if a structure satisfies a given list of SMARTS filters. Supports
    reading canvasSearch rule files.
    """

    # Wrapped in staticmethod so that looking it up through an instance
    # returns the plain callable: a bare functools.partial stored on a class
    # triggers a FutureWarning on Python 3.13 and is slated to bind like a
    # method in later versions.
    _mol_adapter = staticmethod(
        functools.partial(rdkit_adapter.to_rdkit,
                          include_properties=False,
                          include_coordinates=False))
    _mol_class = Chem.Mol

    def readFile(self, fh, delimiter='\t'):
        """
        Add SMARTS filters given a file-like object of delimited rows.

        The columns are: SMARTS, minimum number of matches, maximum number
        of matches, name. Only the SMARTS column is required; a missing
        count defaults to 0 and a missing name to the empty string. Empty
        rows are skipped.

        :type fh: iterable of str
        :param delimiter: the column separator
        :type delimiter: str
        :raises ValueError: if a count is not an integer or a SMARTS is
            invalid; the message is prefixed with the 1-based row number
        """
        reader = csv.reader(fh, dialect='excel-tab', delimiter=delimiter)
        for linenum, row in enumerate(reader, 1):
            if not row:
                continue
            ncols = len(row)
            smarts = row[0]
            try:
                min_matches = int(row[1]) if ncols > 1 else 0
                max_matches = int(row[2]) if ncols > 2 else 0
                name = row[3] if ncols > 3 else ''
                sf = SingleSmartsFilter(smarts, name, min_matches, max_matches)
            except ValueError as err:
                # May be due to bad int or bad SMARTS.
                raise ValueError("%d: %s" % (linenum, err))
            self.append(sf)
def _op_regex_match(l, r):
    """
    Return True if the regular expression r is found anywhere in l, ignoring
    case. The parameter names match the lambdas this function replaces.
    """
    return bool(re.search(r, l, re.I))


def _op_regex_no_match(l, r):
    """
    Return True if the regular expression r is not found in l, ignoring case.
    """
    return not re.search(r, l, re.I)


# Maps each operator token of a filter expression to the two-argument
# callable implementing it; the callables are invoked as (left, right).
OPERATORS = dict([
    ('>', operator.gt),
    ('>=', operator.ge),
    ('<', operator.lt),
    ('<=', operator.le),
    ('==', operator.eq),
    ('!=', operator.ne),
    ('~', _op_regex_match),
    ('!~', _op_regex_no_match),
    ('AND', operator.and_),
    ('OR', operator.or_),
])
# Parser state variables. These module-level globals are shared between the
# parse actions and parse_filter_expression.
_last_op = None  # operator most recently recorded by _op_action
_instructions = []  # filled by the parse actions; emptied in place per parse
_pp_parser = None  # cached parser object
def _op_action(t):
    """
    Parse action for an operator token: store it in the module-level
    _last_op, where _term_action picks it up for a term that has no operator
    of its own.
    """
    global _last_op
    _last_op = t[0]
def _and_action(t):
    """
    Parse action: append an ("AND",) instruction to the module-level
    _instructions list. The tokens in t are not used.
    """
    _instructions.append(("AND",))
def _or_action(t):
    """
    Parse action: append an ("OR",) instruction to the module-level
    _instructions list. The tokens in t are not used.
    """
    _instructions.append(("OR",))
def _push_first(t):
    """
    Parse action: append the first token of t, unchanged, to the module-level
    _instructions list.
    """
    _instructions.append(t[0])
def _term_action(t):
    """
    Parse action for a term: append an (operator, value) instruction to the
    module-level _instructions list and return the tokens.

    When the term consists of a single token (a value with no operator), the
    operator last stored in _last_op is inserted in front of it first.
    """
    if len(t) == 1:
        # No operator was written for this term; reuse the previous one.
        t.insert(0, _last_op)
    op, val = t
    _instructions.append((op, val))
    return t
def _get_filter_parser():
    """
    Return a parser for propfilter expressions.

    The parser is built on the first call and stored in the module-level
    _pp_parser; later calls return that same object. The parser does not
    return the instructions itself: its parse actions (_push_first,
    _op_action, _term_action, _and_action, _or_action) record them in
    module-level state as parsing proceeds.

    :rtype: pyparsing.ParserElement
    """
    global _pp_parser
    if not _pp_parser:
        # Numeric literals: optional sign, digits, optional fraction and
        # optional exponent. A plain integer also matches floatnumber.
        point = pp.Literal('.')
        e = pp.CaselessLiteral('E')
        plusorminus = pp.Literal('+') | pp.Literal('-')
        number = pp.Word(pp.nums)
        integer = pp.Combine(pp.Optional(plusorminus) + number)
        floatnumber = pp.Combine(integer +
                                 pp.Optional(point + pp.Optional(number)) +
                                 pp.Optional(e + integer))
        # A string value is either quoted (quotes removed) or a run of
        # printable characters.
        string = pp.quotedString.setParseAction(pp.removeQuotes) | pp.Word(
            pp.printables)
        # Two-character operators are listed ahead of the one-character
        # operators they start with ('<=' before '<', '>=' before '>').
        op = (pp.Literal('<=') | pp.Literal('<') | pp.Literal('>=') |
              pp.Literal('>') | pp.Literal('==') | pp.Literal('!=') |
              pp.Literal('~') | pp.Literal('!~')).setParseAction(_op_action)
        and_op = pp.Literal('AND')
        or_op = pp.Literal('OR')
        property_name = pp.Word(pp.printables).setParseAction(_push_first)
        # A value is a number, or a string that is not the AND/OR keyword.
        value = floatnumber | pp.NotAny(or_op | and_op) + string
        # Later terms may omit the operator; the first term may not.
        term = (pp.Optional(op) + value).setParseAction(_term_action)
        first_term = (op + value).setParseAction(_term_action)
        # The AND keyword is optional: a term following another term is
        # combined with it by AND either way.
        and_term = (pp.Optional(and_op) + term).setParseAction(_and_action)
        and_expr = term + pp.ZeroOrMore(and_term)
        first_and_expr = first_term + pp.ZeroOrMore(and_term)
        # The OR instruction is recorded after the whole AND-group on its
        # right has been parsed.
        orexpr = first_and_expr + pp.ZeroOrMore(
            (or_op + and_expr).setParseAction(_or_action))
        # The conditions are optional: a bare property name is accepted.
        _pp_parser = property_name + pp.Optional(orexpr) + pp.StringEnd()
    return _pp_parser
def _prop_basename(propname):
    """
    Return propname without a leading "<type>_rdkit_" prefix, where <type> is
    a single character. Names without such a prefix are returned unchanged.

    :type propname: str
    :rtype: str
    """
    prefix_pattern = re.compile(r'^._rdkit_')
    return prefix_pattern.sub('', propname)
def _has_prop(st_or_mol, propname):
    """
    Tell whether a structure carries the named property.

    Objects with a `property` mapping are checked by membership. When that
    lookup raises AttributeError, the object is queried through HasProp
    instead, first with the full name and then with the name stripped by
    _prop_basename.

    :type st_or_mol: Structure or Mol
    :type propname: str
    :rtype: bool
    """
    try:
        found = propname in st_or_mol.property
    except AttributeError:
        found = (st_or_mol.HasProp(propname) or
                 st_or_mol.HasProp(_prop_basename(propname)))
    return found
def _get_prop(st_or_mol, propname):
    """
    Fetch the value of a property from a Structure or a Mol.

    For an object with a `property` mapping the stored value is returned as
    is. When that lookup raises AttributeError, the value is read through
    GetProp, trying the full name first and the name stripped by
    _prop_basename if that raises KeyError; the result is returned as a
    float when it converts to one and unchanged otherwise.

    :type st_or_mol: Structure or Mol
    :type propname: str
    :rtype: float or str or int or bool
    """
    try:
        return st_or_mol.property[propname]
    except AttributeError:
        # No `property` attribute: read the value through GetProp.
        try:
            raw = st_or_mol.GetProp(propname)
        except KeyError:
            raw = st_or_mol.GetProp(_prop_basename(propname))
        try:
            number = float(raw)
        except ValueError:
            return raw
        else:
            return number
def parse_filter_expression(s, verbose=False):
    """
    Translate a filter expression into a list of instructions for a
    stack-based machine.

    The first element of the list is the property name. Every other element
    is a tuple: an operator, optionally followed by the value the property is
    compared against. A tuple holding only an operator is applied to two
    values popped from the result stack. For example,
    "r_i_glide_gscore < -5 > -6" produces
    ['r_i_glide_gscore', ('<', '-5'), ('>', '-6'), ('AND',)].

    :param s: the filter expression
    :type s: str
    :param verbose: accepted but not used by this function
    :type verbose: bool
    :rtype: list
    """
    # The parse actions append to the shared module-level list, so it is
    # emptied in place before parsing and copied afterwards.
    _instructions[:] = []
    tokens = _get_filter_parser().parseString(s)
    program = list(_instructions)
    logger.debug("Tokens: %s", tokens)
    logger.debug("Instructions: %s", program)
    return program