Source code for schrodinger.utils.ligfilter

"""
Support module for Ligfilter applications, including parsing functions,
filtering criteria, constants, and setting up of the default composite
SMARTS patterns.

The basic idea is to provide a set of criteria for filtering structures
based on properties, function evaluation, or collections of SMARTS patterns.
These criteria can be easily specified in an external file.

Examples of criteria definitions:

    Molecular_weight < 300  A predefined criterion type
    i_qp_#amide >= 1        A property-based criterion
    Alcohols == 0           A SMARTS definition matching criterion
    s_sd_Asinex             A check for the existence of a property

General terminology used in the documentation of this module:
    - SMARTS expression - a SMARTS string
    - DEFINITION - a named definition, which can be simple (i.e., just a
      SMARTS expression) or composite (including/excluding multiple
      definitions, whether simple or composite).
    - KEY - a definition name or predefined function (e.g., Num_atoms)
    - CRITERION - a filtering condition

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Jeff A. Saunders, Matvey Adzhigirey

# ToDo:
#
# Pull out the actual filtering code into a support class, so this
# functionality can be accessed without running the utility.
#
# Rename the default definition file

import os

import schrodinger.job.util as jobutil  # For hunt
from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.structutils import analyze

#
# Global constants
#
# File name of the default definitions file that this module looks for
# and reads when it is imported.
DEFAULT_PATTERNS_FILE = "ligfilter_definitions.lff"
# Extension (without the dot); not referenced elsewhere in this module --
# presumably the extension of filter files, TODO confirm against callers.
FILTERFILE_EXT = "lff"

#NOTE: '<'&'>' must be last for mysplit() to work correctly:
# Comparison operators accepted in a criterion's comparison string.
OPERATORS = ['==', '!=', '<=', '>=', '<', '>']
# Connectives allowed between "<operator> <value>" pairs; they are
# lower-cased to 'and'/'or' before the comparison string is evaluated.
GATES = ['AND', 'OR']

# Values stored in a criterion's 'type' attribute.
PROPERTY = 'property'
PREDEFINED = 'predefined'
SMARTS = 'smarts'
# Not assigned to any criterion's 'type' in this module (AslCriterion
# uses PREDEFINED).
ASL = 'asl'


class _MySplit:

    def __init__(self):
        pass

    def parse(self, thestr):
        self.thestr = thestr
        self.i = -1

        SPACES = [' ', '\t', '\n']
        outlist = []

        char = self.nextChar()
        while char is not None:
            if char in SPACES:
                char = self.nextChar()
                continue

            # Starting a double-quoted string
            if char == '"':
                currstr = char
                while True:
                    char = self.nextChar()
                    if char is None:
                        break
                    currstr += char  # Add quote or any other char
                    if char == '"':
                        break

                # Append string including the quotes:
                outlist.append(currstr)

                if char is not None:
                    # Get the next char after the ending quote:
                    char = self.nextChar()
                # Pass last char to next loop:
                continue

            # Starting a single-quoted string
            if char == "'":
                currstr = char
                while True:
                    char = self.nextChar()
                    if char is None:
                        break
                    currstr += char  # Add quote or any other char
                    if char == "'":
                        break

                # Append string including the quotes:
                outlist.append(currstr)

                if char is not None:
                    # Get the next char after the ending quote:
                    char = self.nextChar()
                # Pass last char to next loop:
                continue

            # Starting a value or property name:
            currstr = ''
            while True:
                currstr += char
                char = self.nextChar()
                if char is None or char in SPACES:
                    break

            outlist.append(currstr)

            # Pass last char to next loop:
            continue

        return outlist

    def nextChar(self):
        self.i += 1
        try:
            return self.thestr[self.i]
        except IndexError:
            return None


def mysplit(thestr):
    """
    Special version of thestr.split()

    Comparison operators are split off as tokens of their own even when no
    spaces surround them, so that the string "criteria<value" becomes
    ["criteria", "<", "value"]. Everything between operators is then
    tokenized on whitespace by _MySplit, which keeps quoted strings intact.
    """
    operator_starts = ('=', '!', '<', '>', '-')
    angle_brackets = ('<', '>')

    # Pass 1: cut the text into operator chunks and the text between them.
    chunks = []
    pending = []
    length = len(thestr)
    pos = 0
    while pos < length:
        char = thestr[pos]
        pos += 1
        if char in operator_starts:
            follower = thestr[pos] if pos < length else None
            if follower == '=':
                # Two-character chunk such as '==', '!=', '<=' or '>='.
                if pending:
                    chunks.append(''.join(pending))
                    pending = []
                chunks.append(char + follower)
                pos += 1  # the '=' has been consumed as well
                continue
            if char in angle_brackets:
                if pending:
                    chunks.append(''.join(pending))
                    pending = []
                chunks.append(char)
                continue
        # Ordinary character, or an operator-like character that is not
        # actually part of an operator.
        pending.append(char)
    if pending:
        chunks.append(''.join(pending))

    # Pass 2: operators are kept as they are, anything else is split on
    # whitespace.
    tokens = []
    for chunk in chunks:
        if chunk in OPERATORS:
            tokens.append(chunk)
        else:
            tokens.extend(_MySplit().parse(chunk))
    return tokens
class Criterion:
    """
    A base class for a structure matching criterion. Each instance will test
    a structure for some property and indicate whether it passes or not.

    Attributes

    type - The classification of the Criterion. Can be PREDEFINED,
           PROPERTY, or SMARTS. It is set by the subclasses, not here, and
           is read when a comparison string is validated.
    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

        name - the name of the Criterion. See subclasses for meaning, as it
               depends on the implementation.

        compstr - a comparison string for evaluating the value of the named
               property. Examples are '< 300' or '>= 1 AND <= 5'.

        If name or compstr are not specified, parseLine() method should be
        used.

        If a PROPERTY Criterion has no operator or value, the Criterion is
        just the existence of the property in the tested structure.

        The reason why compstr is one string instead of two values (operator
        and number) is in order to support implementation of Ev:50600 - Add
        the ability to create criteria with multiple (boolean) conditions
        """
        self.name = name
        self.setCompStr(compstr)

    def setCompStr(self, compstr):
        """
        Set the compstr attributes according to the specified string.

        self.compstr keeps the string as given. self._compstr is the form
        that match_compstr() evaluates: it has "value" inserted before each
        operator and 'and'/'or' instead of 'AND'/'OR'.

        Raises RuntimeError if the string is invalid
        """
        if not compstr:
            # Empty (PropertyCriterion or AslCriterion only)
            self.compstr = None
            self._compstr = None
            return

        # Unique case for -add_descriptors option. Causes this criteria to
        # always match, no matter what the value is.
        if compstr == 'True':
            self.compstr = 'True'
            self._compstr = 'True'
            return

        # compstr must be a series of "OPERATOR VALUE" pairs separated by
        # gates (AND/OR); the first token MUST be an operator.
        GATE, OPERATOR, VALUE = range(3)
        expected_token = OPERATOR
        pieces = []
        for token in mysplit(compstr):
            if expected_token == OPERATOR:
                if token not in OPERATORS:
                    raise RuntimeError('TOKEN "%s" not an operator' % token)
                if self.name.startswith('s_') and token not in ['==', '!=']:
                    raise RuntimeError(
                        'String properties only support operators "==" and "!="'
                    )
                # "value" is important for match_compstr() to work right:
                pieces.append(" value %s" % token)
                expected_token = VALUE
            elif expected_token == VALUE:
                pieces.append(" %s" % self._checkValueToken(token))
                expected_token = GATE
            else:
                # Expecting AND or OR
                if token not in GATES:
                    raise RuntimeError('TOKEN "%s" not a gate' % token)
                # convert 'AND'/'OR' to 'and'/'or':
                pieces.append(" %s" % token.lower())
                expected_token = OPERATOR

        # Last token MUST be VALUE:
        if expected_token != GATE:
            raise RuntimeError('last token is not a value')

        # compstr was verified to be OK at this point
        self.compstr = compstr
        self._compstr = ''.join(pieces)

    def _checkValueToken(self, token):
        """
        Check that 'token' is an acceptable comparison value for this
        criterion and return it, converted to int or float where the
        criterion calls for a number.

        Raises RuntimeError if the value is not supported.
        """
        if self.type == PROPERTY:
            if self.name.startswith('b_'):
                if token not in ['True', 'False']:
                    raise RuntimeError(
                        'TOKEN "%s" must be a value of True or False' % token)
            elif self.name.startswith('i_'):
                try:
                    token = int(token)
                except ValueError:
                    raise RuntimeError('TOKEN "%s" not an int value' % token)
            elif self.name.startswith(('r_', 'f_')):
                # CriterionParser recognises real-valued properties by the
                # 'r_' prefix; 'f_' is kept because it was accepted before.
                try:
                    token = float(token)
                except ValueError:
                    raise RuntimeError('TOKEN "%s" not a float value' % token)
            elif self.name.startswith('s_'):
                if not token.startswith('"') and not token.startswith("'"):
                    raise RuntimeError(
                        'TOKEN "%s": Value for string properties must be quoted'
                        % token)
        elif self.type == SMARTS:
            try:
                token = int(token)
            except ValueError:
                raise RuntimeError('TOKEN "%s" not an int value' % token)
        else:  # Predefined
            try:
                token = float(token)
            except ValueError:
                raise RuntimeError('TOKEN "%s" not a number value' % token)
        return token

    def parseLine(self, line):
        """
        Parse a line of the form:

            <name>                  (Property criterion only)
        or:
            <name> <oper> <value>
        or:
            <name> <oper> <value> AND/OR <oper> <value>

        A two-token line "<name> <value>" is treated as "== <value>", and
        "<name> <low>-<high>" as ">= <low> AND <= <high>".

        Set the name and compstr attributes from the parsed line;
        Raises RuntimeError if the string is invalid
        """
        s = mysplit(line)
        # Check for an empty line before touching s[0].
        if len(s) < 1:
            raise RuntimeError("ERROR Empty line encountered.")
        self.name = s[0]

        if len(s) == 1 and self.type == PROPERTY:
            self.setCompStr('')
        elif len(s) == 2:
            parts = s[1].split('-')
            if len(parts) == 1:
                s.insert(1, '==')
                self.setCompStr(' '.join(s[1:]))
            elif len(parts) == 2:
                # Support for dashes specifying value ranges (required by
                # LigPrep):
                s[1:] = ['>=', parts[0], 'AND', '<=', parts[1]]
                self.setCompStr(' '.join(s[1:]))
            else:
                raise RuntimeError("ERROR Failed to parse: %s" % line)
        else:
            # Three or more tokens, or a bare name on a non-property
            # criterion (which yields an empty comparison string).
            self.setCompStr(' '.join(s[1:]))

    def __str__(self):
        """
        Return a standard string form of the Criterion suitable for filter
        files.
        """
        if self.compstr:
            return f"{self.name:<40} {self.compstr}"
        else:
            return "%-40s" % self.name

    def match_compstr(self, value):
        """
        Return True if the value matches self.compstr, False if not.
        """
        # NOTE(review): self._compstr is evaluated with eval(); the
        # expression refers to the local name 'value'. setCompStr() checks
        # tokens for the cases it handles, but the text ultimately comes from
        # filter files -- do not feed untrusted input through this path.
        return eval(self._compstr)

    def matches(self, st, addprops=False):
        """
        Return True if the structure 'st' matches the criterion, False if
        not. OVERWRITE this method in the subclass

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """

    def getvalue(self, st):
        """
        Return the value of this criterion in the structure 'st'.
        OVERWRITE this method in the subclass
        """
##############################################################################
# Functions to generate pre-defined criteria:
# Each one returns a number (int or float) for predefined criteria
# for the specified structure object
##############################################################################
def Num_rings(st):
    """
    Return the number of rings in the structure 'st' (the length of
    st.ring).
    """
    rings = st.ring
    return len(rings)
def Num_aromatic_rings(st):
    """
    Return the number of aromatic rings in the structure 'st', i.e. the
    rings in st.ring whose isAromatic() is true.
    """
    return sum(1 for ring in st.ring if ring.isAromatic())
def Num_aliphatic_rings(st):
    """
    Return the number of aliphatic rings in the structure 'st', i.e. the
    rings in st.ring whose isAromatic() is false.
    """
    return sum(1 for ring in st.ring if not ring.isAromatic())
def Num_heteroaromatic_rings(st):
    """
    Return the number of heteroaromatic rings in the structure 'st', i.e.
    the rings in st.ring whose isHeteroaromatic() is true (aromatic rings
    containing heteroatoms).
    """
    return sum(1 for ring in st.ring if ring.isHeteroaromatic())
def Num_rotatable_bonds(st):
    """
    Return the number of rotatable bonds in the structure 'st', as
    reported by structutils.analyze.get_num_rotatable_bonds().
    """
    bond_count = analyze.get_num_rotatable_bonds(st)
    return bond_count
def Num_atoms(st):
    """
    Return the number of atoms in the structure 'st' (st.atom_total).
    """
    atom_count = st.atom_total
    return atom_count
def Num_heavy_atoms(st):
    """
    Return the number of non-hydrogen atoms in the structure 'st', i.e.
    the atoms whose atomic_number is not 1.
    """
    return sum(1 for atom in st.atom if atom.atomic_number != 1)
def Num_molecules(st):
    """
    Return the number of molecules in the structure 'st' (st.mol_total).
    """
    molecule_count = st.mol_total
    return molecule_count
def Num_residues(st):
    """
    Return the number of residues in the structure 'st' (the length of
    st.residue).
    """
    residues = st.residue
    return len(residues)
def Molecular_weight(st):
    """
    Return the total molecular weight of the structure 'st'
    (st.total_weight).
    """
    weight = st.total_weight
    return weight
def Num_chiral_centers(st):
    """
    Return the number of chiral centers in the structure 'st', counted as
    the length of structutils.analyze.get_chiral_atoms().
    """
    chiral_atoms = analyze.get_chiral_atoms(st)
    return len(chiral_atoms)
def Total_charge(st):
    """
    Return the total formal charge of the structure 'st'
    (st.formal_charge).
    """
    charge = st.formal_charge
    return charge
def Num_positive_atoms(st):
    """
    Return the number of positive atoms in the structure 'st', i.e. the
    atoms whose formal_charge is greater than zero.
    """
    return sum(1 for atom in st.atom if atom.formal_charge > 0)
def Num_negative_atoms(st):
    """
    Return the number of negative atoms in the structure 'st', i.e. the
    atoms whose formal_charge is less than zero.
    """
    return sum(1 for atom in st.atom if atom.formal_charge < 0)
def Molecular_formula(st):
    """
    Return the result of structutils.analyze.generate_molecular_formula()
    for the structure 'st'.
    """
    formula = analyze.generate_molecular_formula(st)
    return formula
def _get_percent_ss_type(st, ss_type): counted = 0 matched = 0 for atom in st.atom: ss = atom.property["i_m_secondary_structure"] if atom.element == 'H': assert ss == structure.SS_NONE continue counted += 1 if ss == ss_type: matched += 1 try: percent = float(matched) * 100.0 / counted except ZeroDivisionError: percent = 0.0 return round(percent, 6)
def Percent_helix(st):
    """
    Return the value of _get_percent_ss_type() for the structure 'st' and
    the secondary-structure type structure.SS_HELIX.
    """
    helix_type = structure.SS_HELIX
    return _get_percent_ss_type(st, helix_type)
def Percent_strand(st):
    """
    Return the value of _get_percent_ss_type() for the structure 'st' and
    the secondary-structure type structure.SS_STRAND.
    """
    strand_type = structure.SS_STRAND
    return _get_percent_ss_type(st, strand_type)
def Percent_loop(st):
    """
    Return the value of _get_percent_ss_type() for the structure 'st' and
    the secondary-structure type structure.SS_LOOP.
    """
    loop_type = structure.SS_LOOP
    return _get_percent_ss_type(st, loop_type)
#### Dictionary linking Predefined "name" to a module function: # Please document any additions in the PredefinedCriterion class docstring. predefined_function_dict = { 'Num_rings': Num_rings, 'Num_aromatic_rings': Num_aromatic_rings, 'Num_aliphatic_rings': Num_aliphatic_rings, 'Num_heteroaromatic_rings': Num_heteroaromatic_rings, 'Num_rotatable_bonds': Num_rotatable_bonds, 'Num_atoms': Num_atoms, 'Molecular_weight': Molecular_weight, 'Num_chiral_centers': Num_chiral_centers, 'Total_charge': Total_charge, 'Num_positive_atoms': Num_positive_atoms, 'Num_negative_atoms': Num_negative_atoms, 'Num_heavy_atoms': Num_heavy_atoms, 'Num_molecules': Num_molecules, 'Num_residues': Num_residues, 'Molecular_formula': Molecular_formula, 'Percent_helix': Percent_helix, 'Percent_strand': Percent_strand, 'Percent_loop': Percent_loop, } PREDEFINED_KEYS = list(predefined_function_dict) ############################################################################## ##############################################################################
class PropertyCriterion(Criterion):
    """
    A structure matching criterion that acts on the presence or value of a
    specific structure property.

    Without a comparison string the criterion only checks that property
    'name' is present. With one, the property value is compared against the
    comparison string.
    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

        name - the name of the property being evaluated

        compstr - the property comparison string to be used if present;
                currently in format "<operator> <value>"

        If name or compstr are not specified, parseLine() method should be
        used
        """
        self.type = PROPERTY
        super().__init__(name, compstr)

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        st (Structure) - Structure object
        addprops (bool) - ignored for property criterions
        """
        # This docstring should be kept in sync with other 'matches'
        # docstrings.

        # FIXME make <addprops> work
        try:
            current = st.property[self.name]
        except KeyError:
            # A structure without the property never matches.
            return False

        if self.compstr:
            # Compare the stored value against the comparison string.
            return self.match_compstr(current)
        # Existence of the property was all that was asked for.
        return True

    def getvalue(self, st):
        """
        Return the value of the property for this structure.
        Returns None if the property does not exist.
        """
        try:
            found = st.property[self.name]
        except KeyError:
            found = None
        return found
class SmartsCriterion(Criterion):
    """
    A structure matching criterion that looks for a match to a Definition
    instance, which is comprised of a collection of SMARTS patterns.

    For example, for the Definition 'TwoCarbons' that matches against the
    SMARTS pattern [#6][#6], the comparison string

        TwoCarbons < 40

    will match if there are less than 40 carbon-carbon bonds in the
    structure.
    """

    def __init__(self, definition, compstr=None):
        """
        Parameters

        definition - a Definition instance, specifying the SMARTS
                pattern(s) to be included and excluded

        compstr - the comparison string to be used. Currently equal to
                "<operator> <value>". Used by the expand() method

        If compstr is not specified, parseLine() method should be used
        """
        self.type = SMARTS
        super().__init__(definition.name, compstr)
        self.definition = definition

    def _count_occurances(self, st):
        """
        Count the distinct 'includes' matches in the structure, leaving out
        every match whose atoms are all contained in some 'excludes' match.
        """
        # Currently this works correctly only for single definitions, NOT
        # compound definitions (groups of simple definitions)
        include_patterns = self.definition.includes()
        exclude_patterns = self.definition.excludes()

        try:
            include_hits = analyze.evaluate_multiple_smarts(
                st, include_patterns)
            exclude_hits = analyze.evaluate_multiple_smarts(
                st, exclude_patterns)
        except ValueError as err:
            raise RuntimeError(err)

        # Matches that cover the same atoms in a different order are the
        # same match (Ev:84352), so normalise each one to a sorted tuple.
        unique_in = {tuple(sorted(atoms)) for atoms in include_hits}
        unique_ex = {tuple(sorted(atoms)) for atoms in exclude_hits}

        surviving = 0
        for candidate in unique_in:
            covered = any(
                all(atom in blocker for atom in candidate)
                for blocker in unique_ex)
            if not covered:
                surviving += 1
        return surviving

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        The number compared against the comparison string is the count
        returned by _count_occurances().

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """
        if not self.compstr:
            raise RuntimeError("matches(): compstr not defined")

        count = self._count_occurances(st)
        if count < 0:
            raise RuntimeError(
                "Structure had more exclude-definition matches than include-definition matches!"
            )

        if addprops:
            st.property['i_ligfilter_%s' % self.name] = count

        return self.match_compstr(count)

    def getvalue(self, st):
        """
        Return the number of times that definition.includes() patterns match
        the structure without being covered by a definition.excludes()
        match.
        """
        return self._count_occurances(st)

    def expand(self, definitions):
        """
        Generate a new SmartsCriterion from the current one in which the
        definition.includes() and definition.excludes() are expanded from
        'definitions'.
        """
        expanded = self.definition.expand(definitions)
        return SmartsCriterion(expanded, self.compstr)
class PredefinedCriterion(Criterion):
    """
    A structure matching criterion that acts on the value of a predefined
    function applied to the structure. The available functions are the keys
    of predefined_function_dict:

        Num_rings
        Num_aromatic_rings
        Num_aliphatic_rings
        Num_heteroaromatic_rings
        Num_rotatable_bonds
        Num_atoms
        Molecular_weight
        Num_chiral_centers
        Total_charge
        Num_positive_atoms
        Num_negative_atoms
        Num_heavy_atoms
        Num_molecules
        Num_residues
        Molecular_formula
        Percent_helix
        Percent_strand
        Percent_loop

    For example, one definition parseable from the external file is:

        Num_rings == 0
    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

        name - the name of the function to use. Allowed values are those in
                ligfilter.PREDEFINED_KEYS.

        compstr - the comparison string to evaluate the result of the
                predefined function against
        """
        self.type = PREDEFINED
        super().__init__(name, compstr)
        # Looked up from predefined_function_dict on first use.
        self._function = None

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """
        # This docstring should be kept in sync with other 'matches'
        # docstrings.
        if not self.compstr:
            raise RuntimeError("matches(): compstr not defined")

        if not self._function:
            self._function = predefined_function_dict[self.name]
        result = self._function(st)

        if addprops:
            # The property prefix depends on the type of the value.
            kind = type(result)
            if kind == int:
                template = 'i_ligfilter_%s'
            elif kind == float:
                template = 'r_ligfilter_%s'
            elif isinstance(result, str):  # e.g. Molecular_formula
                template = 's_ligfilter_%s'
            else:
                raise ValueError("Invalid type: %s" % result)
            st.property[template % self.name] = result

        return self.match_compstr(result)

    def getvalue(self, st):
        """
        Return the value of the predefined function applied to 'st'. For
        example, return the number of rings, or number of rotatable bonds.
        """
        if not self._function:
            self._function = predefined_function_dict[self.name]
        return self._function(st)
class AslCriterion(Criterion):
    """
    This criterion considers a Structure as matching if evaluating the
    stored ASL expression on it returns at least one atom.
    """

    def __init__(self, asl):
        """
        Parameters

        asl - the ASL expression string. It is stored as the criterion name.
        """
        super().__init__(asl)
        # NOTE(review): the type is PREDEFINED although the module also
        # defines an ASL constant -- confirm with callers before changing.
        self.type = PREDEFINED

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this ASL criterion, False if
        not.

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """
        hit = self.getvalue(st)
        if addprops:
            st.property['b_ligfilter_asl'] = hit
        return hit

    def getvalue(self, st):
        """
        Return True if the structure 'st' matches this ASL, False otherwise.
        """
        atoms = analyze.evaluate_asl(st, self.name)
        return len(atoms) > 0
class Definition:
    """
    A class that defines a collection of SMARTS patterns for matching
    against. The includes() method returns a list of those patterns that
    should be matched, and the excludes() method returns those that
    shouldn't.
    """

    def __init__(self, name, includes=None, excludes=None, group=None):
        """
        Parameters

        name - the name of the definition

        includes - a list of SMARTS patterns (or definition names) to count

        excludes - a list of SMARTS patterns (or definition names) that can
                be used to exclude matches in the includes list

        group - name of the group that this definition is part of
                (optional). See Ev:50599
        """
        self.name = name
        # Copy the lists so that the originals don't get modified:
        self._includes = list(includes) if includes is not None else []
        self._excludes = list(excludes) if excludes is not None else []
        self.group = group

    def addKey(self, key, positive=True):
        """
        Add the SMARTS pattern 'key' to the list of desired matches
        (includes) if 'positive' is True, and to the list of unwanted matches
        (excludes) if 'positive' is False.
        """
        target = self._includes if positive else self._excludes
        target.append(key)

    def removeKey(self, key):
        """
        Remove the SMARTS pattern 'key' from the wanted or unwanted matches
        list.

        Raises RuntimeError if 'key' is in neither list.
        """
        if key in self._includes:
            self._includes.remove(key)
        elif key in self._excludes:
            self._excludes.remove(key)
        else:
            raise RuntimeError("Key '{}' not in definition '{}'".format(
                key, self.name))

    def includes(self):
        """
        Return a list of wanted matches.
        """
        return self._includes

    def excludes(self):
        """
        Return a list of unwanted matches.
        """
        return self._excludes

    def _expandIncludes(self, inclist, definitions, masterlist, _active=None):
        """
        For the provided list 'inclist', expand any entry that names a
        definition in the 'definitions' dictionary into that definition's
        includes (recursively). The expanded entries are appended to the
        parameter 'masterlist'.

        For example, if a Definition TwoCarbons in 'definitions' is made up
        of the "includes" [C][C] and [c][c], and 'inclist' includes
        TwoCarbons, then [C][C] and [c][c] will be added to 'masterlist'.

        '_active' is internal: the chain of definition names currently being
        expanded. A name that shows up again in its own chain is a circular
        definition and raises RuntimeError instead of recursing forever.
        """
        if _active is None:
            _active = []
        for inc in inclist:
            if inc in definitions:
                if inc in _active:
                    raise RuntimeError(
                        "Circular definition detected: %s" %
                        " -> ".join(_active + [inc]))
                # Only the current chain is tracked, so a definition may
                # still be reached through several independent branches.
                _active.append(inc)
                try:
                    self._expandIncludes(definitions[inc]._includes,
                                         definitions, masterlist, _active)
                finally:
                    _active.pop()
            else:
                masterlist.append(inc)

    def _expandExcludes(self, exclist, definitions, masterlist):
        """
        For the provided list 'exclist', expand any entry that names a
        definition in the 'definitions' dictionary. The expanded entries are
        appended to the parameter 'masterlist'.

        Note that the expansion uses the "includes" of the named definition.
        For example, if a Definition TwoCarbons in 'definitions' is made up
        of the "includes" [C][C] and [c][c], and 'exclist' includes
        TwoCarbons, then [C][C] and [c][c] will be added to 'masterlist'.

        Raises RuntimeError if an excluded definition has excludes of its
        own, or if the expansion is circular.
        """
        # TODO: The behavior described above may not be correct. Should we
        # prohibit excluded definitions that themselves exclude other
        # definitions?
        for exc in exclist:
            if exc in definitions:
                if definitions[exc]._excludes:
                    raise RuntimeError(
                        "Error excluding '%s' -- a definition with excludes (-) cannot be included in another definition"
                        % exc)
                self._expandIncludes(definitions[exc]._includes, definitions,
                                     masterlist, [exc])
            else:
                masterlist.append(exc)

    def expand(self, definitions):
        """
        Generate a new Definition from the current one in which the includes
        and excludes are expanded from the provided 'definitions' dictionary.

        Raises RuntimeError for circular definitions.
        """
        newincludes = []
        self._expandIncludes(self._includes, definitions, newincludes)

        newexcludes = []
        self._expandExcludes(self._excludes, definitions, newexcludes)

        # NOTE(review): the group is not carried over to the expanded
        # definition -- confirm whether that is intended.
        return Definition(self.name, newincludes, newexcludes)

    def __str__(self):
        """
        Return a standard string form of the Definition suitable for
        filter/keys files.
        """
        header = "DEFINE %s" % self.name
        if len(self._includes) == 1 and not self._excludes:
            # Single-pattern definitions fit on the DEFINE line itself.
            return " ".join([header, self._includes[0]])
        lines = [header]
        lines.extend(" + %s" % i for i in self._includes)
        lines.extend(" - %s" % e for e in self._excludes)
        return '\n'.join(lines)
class CriterionParser:
    """
    A class for parsing a general property or predefined matching criterion.
    """

    def __init__(self, definitions_dict=None):
        """
        Parameters

        definitions_dict - optional dictionary of known definitions keyed by
                name. When given (and not empty), names that are neither
                properties nor predefined keys must be present in it.
        """
        self.definitions_dict = definitions_dict
        self.line = None
        self.line_num = None

    def error(self, err):
        """
        Raise RuntimeError with the message 'err', prefixed with the current
        line number when one is known.
        """
        if self.line_num:
            err = 'Line %i: %s' % (self.line_num, err)
        raise RuntimeError(err)

    def expression_error(self, msg):
        """
        Raise RuntimeError describing an invalid expression; 'msg' is the
        reason the expression could not be parsed.
        """
        self.error("\n ".join([
            "ERROR: Cound not parse expression due to:", msg,
            "Expression: %s" % self.line,
            "Ligfilter expression needs to be in format: <definition> [<operator> <value>];",
            "where <definition> is a property name or SMARTS definition (no spaces allowed)",
            "Multiple [<operator> <value>] sets must be separated with 'AND' or 'OR'"
        ]))

    def parse(self, line, line_num=None):
        """
        Create a Criterion object from a string. The method expects an input
        line of the form

            <name>

        ...or...

            <name> <operator> <value>

        The first form is valid only for property criteria.

        If the instance has a 'definitions_dict', definition criteria will be
        checked against it for validity.

        Returns a Criterion. Raises RuntimeError if the line cannot be
        parsed.
        """
        self.line = line
        self.line_num = line_num

        try:
            name = mysplit(line)[0]
        except IndexError:
            msg = 'parse(): Could not parse line: "%s"' % line
            raise RuntimeError(msg)

        criterion = None
        # Look at the parsed name rather than the raw line, so that leading
        # whitespace does not hide a property prefix.
        if name[:2] in ['b_', 's_', 'i_', 'r_']:
            # Property
            criterion = PropertyCriterion()
        elif name in PREDEFINED_KEYS:
            criterion = PredefinedCriterion()
        else:
            # Probably SMARTS
            if self.definitions_dict and name not in self.definitions_dict:
                self.error("Unknown definition or property: %s" % name)
            elif self.definitions_dict:
                criterion = SmartsCriterion(self.definitions_dict[name])
            else:
                # Unvalidated criterion. Should this be allowed at all?
                criterion = SmartsCriterion(Definition(name))

        try:
            criterion.parseLine(line)
        except RuntimeError as err:
            self.expression_error(str(err))

        return criterion
class DefinitionParser:
    """
    A class for parsing a (possibly multi-line) specification of a
    Definition.
    """

    def __init__(self):
        self.lines = None
        self.line_num = None

    def error(self, err):
        """
        Print the message 'err', prefixed with the current line number when
        one is known, and raise RuntimeError with the same message.
        """
        if self.line_num:
            err = 'Line %i: %s' % (self.line_num, err)
        print(err)
        raise RuntimeError(err)

    def parse(self, lines, line_num=None, group=None):
        """
        Return a Definition from a list of lines. No expansion of definitions
        is done. General pattern of the specification is

            DEFINE <name> <SMARTS pattern>

        or

            DEFINE <name>
                (+ include_definition)*
                (- exclude_definition)*

        Where the asterisk indicates zero or more of each of the include and
        exclude definitions.

        Options:

        line_num - line number of the first line in the file being parsed
                (for error messages). Each following entry of 'lines' is
                assumed to be on the next line of the file.

        group - name of the definition group (or None, if there is no group)

        Raises RuntimeError if the specification is invalid.
        """
        self.lines = lines
        self.line_num = line_num

        if not lines:
            self.error("Definition expression must start with 'DEFINE'")

        firstline = lines[0]
        s = mysplit(firstline)

        if not s or s[0] != 'DEFINE':
            self.error("Definition expression must start with 'DEFINE'")

        if len(s) < 2 or len(s) > 3:
            self.error("Invalid DEFINE statement: %s" % firstline)

        name = s[1]  # The second "word" of the line

        if name in PREDEFINED_KEYS:
            self.error("Cannot redefine built-in definition: %s" % name)

        if name[0:2] in ['s_', 'i_', 'r_', 'b_']:
            self.error("Invalid definition name: %s" % name)

        if len(s) == 3:
            # Single-line definition. Value should be a SMARTS definition.
            return Definition(name, includes=[s[2]], group=group)

        # Multi-line definition
        definition = Definition(name, group=group)
        for offset, line in enumerate(lines[1:], start=1):
            if line_num is not None:
                self.line_num = line_num + offset
            s = mysplit(line)
            if not s:  # Blank line
                continue
            if len(s) != 2:
                # Have to have 2 "words" on each line
                self.error("Invalid definition line: %s" % line)
            inc_exc, value = s
            if inc_exc == '+':
                definition.addKey(value)
            elif inc_exc == '-':
                definition.addKey(value, False)
            else:
                self.error("Invalid definition line: %s" % line)

        if not definition.includes():
            self.error("Definition '%s' has no includes" % definition.name)
        return definition
def read_keys(fh, validate=False, validdefinitions=None):
    """
    Generate lists of Definitions and Criteria from an iterator 'fh' that
    returns a line at a time of the Definition and Criteria specification.
    For example, this iterator can be an open file or a list of strings.

    If 'validate' is True, definition names in criteria will be checked
    against known Definitions, including those previously read from 'fh'
    and passed in via 'validdefinitions'.

    A definition belongs to the GROUP that is open on its DEFINE line.

    No expansion of Definitions is done.

    Return a tuple of (Definition list, Criterion list).
    """
    # Name -> Definition lookup of everything known so far. It is built from
    # the caller's list, which itself is never modified.
    known = {}
    if validdefinitions:
        for d in validdefinitions:
            known[d.name] = d

    criteria = []
    definitions = []
    line_num = 0

    # Lines, starting line number and group of the definition being read.
    pending_lines = []
    pending_line_num = 0
    pending_group = None

    currgroup = None
    dp = DefinitionParser()

    def finish_pending():
        """Parse the definition collected so far and record it."""
        newdefinition = dp.parse(pending_lines,
                                 pending_line_num,
                                 group=pending_group)
        definitions.append(newdefinition)
        known[newdefinition.name] = newdefinition

    for line in fh:
        line = line.strip()  # Get rid of leading and trailing spaces/tabs
        line_num += 1

        # Skip "#" lines and lines beginning with "# " (comments):
        # Note: Comments MUST have a space after #. This is implemented
        # to support definitions that start with a pound.
        if not line or line == "#" or line.startswith('# '):
            continue

        line = line.split('# ')[0]  # Delete trailing comment
        keyword = mysplit(line)[0]

        if keyword == "GROUP":
            # The group is everything after the space following the keyword:
            # (spaces are supported)
            currgroup = line[6:]
        elif keyword == "GROUPEND":
            currgroup = None
        elif keyword == 'DEFINE':
            if pending_lines:
                # The definition we were reading is done
                finish_pending()
            # Start the new definition, remembering the group that is open
            # now: a GROUPEND may be read before this definition is parsed.
            pending_lines = [line]
            pending_line_num = line_num
            pending_group = currgroup
        elif keyword in ('+', '-'):
            pending_lines.append(line)
        else:
            # Criterion
            if pending_lines:
                # The definition we were reading is done.
                finish_pending()
                pending_lines = []
                pending_line_num = 0

            cp = CriterionParser(known) if validate else CriterionParser()
            criteria.append(cp.parse(line, line_num))

    if pending_lines:
        # The definition we were reading is done.
        finish_pending()

    return (definitions, criteria)
#
# Find and read the default definitions from the Schrodinger installation.
#
# This runs at import time. It needs SCHRODINGER in the environment and a
# resolvable mmshare executable directory, and it publishes the parsed
# definitions as the module-level name 'default_definitions'.
#
try:
    SCHRODINGER = os.environ['SCHRODINGER']
except KeyError as err:
    raise Exception("SCHRODINGER is not defined.") from err

if 'MMSHARE_EXEC' in os.environ:
    MMSHARE_EXEC = os.environ['MMSHARE_EXEC']
else:
    # Hunt for mmshare. Best effort: a failure here is reported by the
    # check just below.
    try:
        MMSHARE_EXEC = jobutil.hunt("mmshare")
    except Exception:
        MMSHARE_EXEC = None

if not MMSHARE_EXEC:
    raise Exception("Could not determine MMSHARE_EXEC.")

# Candidate locations of the standard LigFilter definitions file:
MMSHARE_datadir = os.path.join(MMSHARE_EXEC, "..", "..", "data")
mmshare_data_file = os.path.join(MMSHARE_datadir, DEFAULT_PATTERNS_FILE)

try:
    appdata_dir = mm.mmfile_schrodinger_appdata_dir()
except Exception as err:
    raise RuntimeError(
        "Could not determine the Schrodinger application data directory."
    ) from err

appdata_file = os.path.join(appdata_dir, DEFAULT_PATTERNS_FILE)

# Search: 1) CWD, 2) .schrodinger 3) MMSHARE/data:
if os.path.isfile(DEFAULT_PATTERNS_FILE):
    _definition_file_name = DEFAULT_PATTERNS_FILE
    print('Using local copy of %s' % DEFAULT_PATTERNS_FILE)
elif os.path.isfile(appdata_file):
    _definition_file_name = appdata_file
else:
    # Not checked for existence here; a missing file is reported when it is
    # opened below.
    _definition_file_name = mmshare_data_file

# Open the definition file:
try:
    _definition_fh = open(_definition_file_name)
except Exception as err:
    raise Exception("Failed to open %s file." %
                    _definition_file_name) from err

# Read the definitions (and criteria, if present) from the definitions file.
# The 'with' block closes the file even if parsing fails.
with _definition_fh:
    (default_definitions, _criteria) = read_keys(_definition_fh,
                                                 validate=True)

if _criteria:
    print("")
    print("WARNING: There are criteria in the default definitions file. They")
    print(" will be ignored.")
    print("")
def get_default_criterion_parser():
    """
    Return a CriterionParser built from the default definitions.

    The module-level default definitions are indexed by name, each one is
    expanded against that index, and the expanded definitions (still keyed
    by name) are handed to the CriterionParser.
    """
    # Index the default definitions by name
    by_name = {d.name: d for d in default_definitions}

    # Expand every definition before any matching is done. It might be
    # enough to expand only the definitions referenced by Criterion objects,
    # since the unexpanded index is not used after this point.
    expanded = {
        dname: definition.expand(by_name)
        for dname, definition in by_name.items()
    }

    return CriterionParser(expanded)
def generate_criterion(condition, cp=None):
    """
    Build a Criterion object from a condition string (Ev:55805).

    The condition string may be something like:
        "Num_atoms < 100"

    The returned criterion can then be used as follows:
        if criterion.matches(st):
            <do>

    Options:
        cp - the CriterionParser to parse the condition with. If None, a
             parser with the default definitions is created for this call.
    """
    parser = get_default_criterion_parser() if cp is None else cp
    return parser.parse(condition)
def st_matches_criteria(st, criteria_list, match_any=False, addprops=False):
    """
    If the specified structure matches the criteria, returns None.
    If it does not match, a string explaining the reason is returned.

    Arguments:
        st - the structure to test; it is passed as-is to each
             criterion's matches() method.
        criteria_list - the criteria to test against. Any iterable is
             accepted. Each criterion must provide a 'type' attribute, a
             matches(st, addprops=...) method and a 'name' attribute (used
             in the failure message).
    Options:
        match_any - if True, st is considered to match if it matches at
             least one criterion; otherwise all criteria must be matched.
        addprops - passed through to each criterion's matches(); if True,
             properties for each descriptor are added to st.

    Criteria are evaluated grouped by type, in the order PROPERTY, SMARTS,
    PREDEFINED, ASL, keeping the input order within each type. Criteria of
    any other type are not evaluated. Evaluation stops at the first
    deciding criterion, so later criteria are not evaluated at all.

    With no criteria, the result is None unless match_any is True, in
    which case nothing can match and the "no match" string is returned.
    """
    # Materialize once: the criteria are scanned once per type below, which
    # would silently drop criteria if a one-shot iterator were passed in.
    criteria = list(criteria_list)

    if not criteria:
        return "did not match any criteria" if match_any else None

    # Sort criteria by order of execution (fast execution first):
    execution_order = (PROPERTY, SMARTS, PREDEFINED, ASL)
    sorted_criteria = [
        c for ctype in execution_order for c in criteria if c.type == ctype
    ]

    for c in sorted_criteria:
        match = c.matches(st, addprops=addprops)
        if match_any:
            if match:
                # One match is enough
                return None
        elif not match:
            # One failure is enough
            return "did not match criteria: %s" % c.name

    if match_any:
        # No matches found
        return "did not match any criteria"
    # No non-matches found
    return None
#EOF