# Source code for schrodinger.utils.ligfilter

"""
Support module for Ligfilter applications, including parsing functions,
filtering criteria, constants, and setting up of the default composite
SMARTS patterns.

The basic idea is to provide a set of criteria for filtering structures
based on properties, function evaluation, or collections of SMARTS patterns.
These criteria can be easily specified in an external file.

Examples of criteria definitions:

    Molecular_weight < 300  A predefined criterion type
    i_qp_#amide >= 1        A property-based criterion
    Alcohols == 0           A SMARTS definition matching criterion
    s_sd_Asinex             A check for the existence of a property

General terminology used in the documentation of this module:
    - SMARTS expression - a SMARTS string
    - DEFINITION - a named definition, which can be simple (i.e., just a
      SMARTS expression) or composite (including/excluding multiple
      definitions, whether simple or composite).
    - KEY - a definition name or predefined function (e.g., Num_atoms)
    - CRITERION - a filtering condition

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Jeff A. Saunders, Matvey Adzhigirey

# ToDo:
#
# Pull out the actual filtering code into a support class, so this
# functionality can be accessed without running the utility.
#
# Rename the default definition file

import os

import schrodinger.job.util as jobutil  # For hunt
from schrodinger import structure
from schrodinger.infra import mm
from schrodinger.structutils import analyze

#
# Global constants
#
# File name of the default definitions file and the extension used for
# filter files.
DEFAULT_PATTERNS_FILE = "ligfilter_definitions.lff"
FILTERFILE_EXT = "lff"

# Comparison operators accepted in a criterion's comparison string.
#NOTE: '<'&'>' must be last for mysplit() to work correctly:
OPERATORS = ['==', '!=', '<=', '>=', '<', '>']
# Boolean gates that may join successive "<operator> <value>" pairs.
GATES = ['AND', 'OR']

# Values used for the 'type' attribute of Criterion subclasses.
PROPERTY = 'property'
PREDEFINED = 'predefined'
SMARTS = 'smarts'
# NOTE(review): ASL is not referenced in the visible part of this module
# (AslCriterion assigns PREDEFINED) -- confirm where it is used.
ASL = 'asl'


class _MySplit:

    def __init__(self):
        pass

    def parse(self, thestr):
        self.thestr = thestr
        self.i = -1

        SPACES = [' ', '\t', '\n']
        outlist = []

        char = self.nextChar()
        while char is not None:
            if char in SPACES:
                char = self.nextChar()
                continue

            # Starting a double-quoted string
            if char == '"':
                currstr = char
                while True:
                    char = self.nextChar()
                    if char is None:
                        break
                    currstr += char  # Add quote or any other char
                    if char == '"':
                        break

                # Append string including the quotes:
                outlist.append(currstr)

                if char is not None:
                    # Get the next char after the ending quote:
                    char = self.nextChar()
                # Pass last char to next loop:
                continue

            # Starting a single-quoted string
            if char == "'":
                currstr = char
                while True:
                    char = self.nextChar()
                    if char is None:
                        break
                    currstr += char  # Add quote or any other char
                    if char == "'":
                        break

                # Append string including the quotes:
                outlist.append(currstr)

                if char is not None:
                    # Get the next char after the ending quote:
                    char = self.nextChar()
                # Pass last char to next loop:
                continue

            # Starting a value or property name:
            currstr = ''
            while True:
                currstr += char
                char = self.nextChar()
                if char is None or char in SPACES:
                    break

            outlist.append(currstr)

            # Pass last char to next loop:
            continue

        return outlist

    def nextChar(self):
        self.i += 1
        try:
            return self.thestr[self.i]
        except IndexError:
            return None


def mysplit(thestr):
    """
    Special version of thestr.split()

    The following string: "criteria<value" will be split into:
    ["criteria", "<", "value"]

    Implemented so that spaces would no longer be required in criteria.

    The string is first cut around operators: any of '=', '!', '<', '>', '-'
    immediately followed by '=' forms a two-character piece, and a lone '<'
    or '>' forms a one-character piece. Pieces that are listed in OPERATORS
    are returned as they are; every other piece is split on whitespace by
    _MySplit, which keeps quoted text together.

    NOTE(review): the operator scan does not look at quotes, so an operator
    character inside a quoted value is split as well.

    :param thestr: the text to split
    :return: list of token strings
    """
    pieces = []
    currstr = ''
    skip = False
    for i, char in enumerate(thestr):
        if skip:
            # Second character of a two-character operator: already used.
            skip = False
            continue

        if char not in ['=', '!', '<', '>', '-']:
            currstr += char
            continue

        # Reached a possible first char of an operator; peek at the next
        # character (empty string at the end of the input).
        nextchar = thestr[i + 1:i + 2]

        if nextchar == '=':
            if currstr:
                pieces.append(currstr)
            pieces.append(char + nextchar)
            currstr = ''
            skip = True
        elif char in ['<', '>']:
            if currstr:
                pieces.append(currstr)
            pieces.append(char)
            currstr = ''
        else:  # It's not really a token:
            currstr += char

    if currstr:
        pieces.append(currstr)

    outlist = []
    for piece in pieces:
        if piece in OPERATORS:
            outlist.append(piece)
        else:
            outlist.extend(_MySplit().parse(piece))

    return outlist


class Criterion:
    """
    A base class for a structure matching criterion. Each instance will test
    a structure for some property and indicate whether it passes or not.

    Attributes

        name - The name of the Criterion; its meaning depends on the
            subclass.

        type - The classification of the Criterion.  Can be PREDEFINED,
            PROPERTY, or SMARTS.  It is assigned by the subclasses; this
            base class only reads it (in setCompStr()).

        compstr - The comparison string as supplied, or None when absent.

        _compstr - The form of compstr that match_compstr() evaluates:
            "value" is inserted before every operator and the gates are
            lower-cased ('and'/'or').

    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

            name - the name of the Criterion. See subclasses for meaning, as
                it depends on the implementation.

            compstr - a comparison string for evaluating the value of the
                named property. Examples are '< 300' or '>= 1 AND <= 5'.

        If name or compstr are not specified, parseLine() method should be used

        If a PROPERTY Criterion has no operator or value, the Criterion is
        just the existence of the property in the tested structure.

        The reason why compstr is one string instead of two values (operator
        and number) is in order to support implementation of Ev:50600 - Add
        the ability to create criteria with multiple (boolean) conditions
        """

        self.name = name
        self.setCompStr(compstr)

    def setCompStr(self, compstr):
        """
        Set the compstr attributes according to specified line
        Raises RuntimeError if the string is invalid
        """

        # self._compstr is equal to self.compstr except that it has "value"
        # entered before the operator, and has 'and'/'or' instead of 'AND'/'OR':

        if not compstr:  # Empty (PropertyCriterion or AslCriterion only)
            self.compstr = None
            self._compstr = None
            return
        # Unique case for -add_descriptors option. Causes this criteria to
        # always match, no matter what the value is.
        if compstr == 'True':
            self.compstr = 'True'
            self._compstr = 'True'
            return

        # compstr must be a series of "OPERATOR VALUE" separated by gates (AND/OR)

        GATE, OPERATOR, VALUE = range(3)  # parser states

        modified_compstr = ''

        # First token MUST be OPERATOR:
        expected_token = OPERATOR
        for token in mysplit(compstr):
            if expected_token == OPERATOR:
                # Expecting '==', '!=', '<=', '>=', '<', '>':
                if token not in OPERATORS:
                    raise RuntimeError('TOKEN "%s" not an operator' % token)

                if self.name.startswith('s_'):
                    if token not in ['==', '!=']:
                        raise RuntimeError(
                            'String properties only support operators "==" and "!="'
                        )

                expected_token = VALUE
                # "value" is important for match_compstr() to work right:
                modified_compstr += " value %s" % token

            elif expected_token == VALUE:
                expected_token = GATE
                modified_compstr += " %s" % self._checkValue(token)

            elif expected_token == GATE:
                # Expecting AND or OR
                if token not in GATES:
                    raise RuntimeError('TOKEN "%s" not a gate' % token)
                expected_token = OPERATOR
                # convert 'AND'/'OR' to 'and'/'or':
                modified_compstr += " %s" % token.lower()

        # Last token MUST be VALUE:
        if expected_token != GATE:
            raise RuntimeError('last token is not a value')

        # compstr was verified to be OK at this point
        self.compstr = compstr
        self._compstr = modified_compstr

    def _checkValue(self, token):
        """
        Check that the value 'token' is supported for this criterion and
        return what should be written into the evaluated comparison string
        (the converted number for numeric criteria, the token itself
        otherwise).

        Raises RuntimeError if the value is not acceptable.
        """
        if self.type == PROPERTY:
            if self.name.startswith('b_'):
                if token not in ['True', 'False']:
                    raise RuntimeError(
                        'TOKEN "%s" must be a value of True or False' % token)

            elif self.name.startswith('i_'):
                try:
                    return int(token)
                except ValueError as err:
                    raise RuntimeError('TOKEN "%s" not an int value' %
                                       token) from err

            elif self.name.startswith('f_'):
                try:
                    return float(token)
                except ValueError as err:
                    raise RuntimeError('TOKEN "%s" not a float value' %
                                       token) from err

            elif self.name.startswith('s_'):
                if not token.startswith('"') and not token.startswith("'"):
                    # The offending token is now interpolated; the message
                    # used to be emitted with a literal "%s".
                    raise RuntimeError(
                        'TOKEN "%s": Value for string properties must be quoted'
                        % token)

            # NOTE(review): property names with any other prefix (e.g. 'r_')
            # are not validated here, so their value token reaches eval() in
            # match_compstr() unchecked.
            return token

        if self.type == SMARTS:
            try:
                return int(token)
            except ValueError as err:
                raise RuntimeError('TOKEN "%s" not an int value' %
                                   token) from err

        # Predefined
        try:
            return float(token)
        except ValueError as err:
            raise RuntimeError('TOKEN "%s" not a number value' %
                               token) from err

    def parseLine(self, line):
        """
        Parse a line of the form:
            <name>
            (Property criterion only)
        or:
            <name> <value>
            (shorthand for "== <value>")
        or:
            <name> <low>-<high>
            (shorthand for ">= <low> AND <= <high>")
        or:
            <name> <oper> <value>
        or:
            <name> <oper> <value> AND/OR <oper> <value>

        Set the name and compstr attributes from the parsed line;
        Raises RuntimeError if the string is invalid
        """
        s = mysplit(line)

        # Check for emptiness BEFORE touching s[0]; otherwise an empty line
        # surfaces as an IndexError instead of the documented RuntimeError.
        if not s:
            raise RuntimeError("ERROR Empty line encountered.")
        self.name = s[0]

        if len(s) == 1 and self.type == PROPERTY:
            self.setCompStr('')
        elif len(s) == 2:
            parts = s[1].split('-')
            if len(parts) == 1:
                s.insert(1, '==')
                self.setCompStr(' '.join(s[1:]))
            elif len(parts) == 2:
                # Support for dashes specifying value ranges (required by LigPrep):
                s[1] = '>='
                s.extend([parts[0], 'AND', '<=', parts[1]])
                self.setCompStr(' '.join(s[1:]))
            else:
                raise RuntimeError("ERROR Failed to parse: %s" % line)
        else:
            # Three or more tokens, or a bare name on a non-PROPERTY
            # criterion (which leaves compstr unset).
            self.setCompStr(' '.join(s[1:]))

    def __str__(self):
        """
        Return a standard string form of the Criterion suitable for filter
        files.

        """
        if self.compstr:
            return f"{self.name:<40} {self.compstr}"
        else:
            return "%-40s" % self.name

    def match_compstr(self, value):
        """
        Return True if the value matches self.compstr, False if not.

        'value' is referenced by name from the evaluated expression.
        """
        # NOTE(security): self._compstr is evaluated with eval(). It is
        # assembled by setCompStr(), which validates most but not all value
        # tokens; do not feed comparison strings from untrusted sources.
        return eval(self._compstr)

    def matches(self, st, addprops=False):
        """
        Return True if the structure 'st' matches the criterion, False if not.
        OVERWRITE this method in the subclass

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """

    def getvalue(self, st):
        """
        Return the value of this criterion in the structure 'st'.
        OVERWRITE this method in the subclass
        """


##############################################################################
# Functions to generate pre-defined criteria:
# Each one returns a number (int or float) for predefined criteria
# for the specified structure object
##############################################################################


def Num_rings(st):
    """
    Return the number of rings in the structure 'st', i.e. the length of
    'st.ring'.

    """
    return len(st.ring)


def Num_aromatic_rings(st):
    """
    Return the number of aromatic rings in the structure 'st', i.e. the
    rings in 'st.ring' whose isAromatic() is true.

    """
    return sum(1 for ring in st.ring if ring.isAromatic())


def Num_aliphatic_rings(st):
    """
    Return the number of aliphatic rings in the structure 'st', i.e. the
    rings in 'st.ring' whose isAromatic() is false.

    """
    return sum(1 for ring in st.ring if not ring.isAromatic())


def Num_heteroaromatic_rings(st):
    """
    Return the number of heteroaromatic rings in the structure 'st', i.e.
    the rings in 'st.ring' whose isHeteroaromatic() is true.

    """
    return sum(1 for ring in st.ring if ring.isHeteroaromatic())


def Num_rotatable_bonds(st):
    """
    Return the number of rotatable bonds in the structure 'st', as
    determined by structutils.analyze.get_num_rotatable_bonds().

    """
    return analyze.get_num_rotatable_bonds(st)


def Num_atoms(st):
    """
    Return the number of atoms in the structure 'st' (its 'atom_total').

    """
    return st.atom_total


def Num_heavy_atoms(st):
    """
    Return the number of non-hydrogen atoms in the structure 'st', i.e. the
    atoms whose atomic_number is not 1.

    """
    return sum(1 for atom in st.atom if atom.atomic_number != 1)


def Num_molecules(st):
    """
    Return the number of molecules in the structure 'st' (its 'mol_total').

    """
    return st.mol_total


def Num_residues(st):
    """
    Return the number of residues in the structure 'st', i.e. the length of
    'st.residue'.

    """
    return len(st.residue)


def Molecular_weight(st):
    """
    Return the total molecular weight of the structure 'st' (its
    'total_weight').

    """
    return st.total_weight


def Num_chiral_centers(st):
    """
    Return the number of chiral centers in the structure 'st', as determined
    by structutils.analyze.get_chiral_atoms().

    """
    return len(analyze.get_chiral_atoms(st))


def Total_charge(st):
    """
    Return the total formal charge of the structure 'st' (its
    'formal_charge').

    """
    return st.formal_charge


def Num_positive_atoms(st):
    """
    Return the number of positive atoms in the structure 'st', i.e. the
    atoms whose formal_charge is greater than zero.

    """
    return sum(1 for atom in st.atom if atom.formal_charge > 0)


def Num_negative_atoms(st):
    """
    Return the number of negative atoms in the structure 'st', i.e. the
    atoms whose formal_charge is less than zero.

    """
    return sum(1 for atom in st.atom if atom.formal_charge < 0)


def Molecular_formula(st):
    """
    Return the molecular formula of the structure 'st', as generated by
    structutils.analyze.generate_molecular_formula().

    """
    return analyze.generate_molecular_formula(st)


def _get_percent_ss_type(st, ss_type):
    counted = 0
    matched = 0
    for atom in st.atom:
        ss = atom.property["i_m_secondary_structure"]
        if atom.element == 'H':
            assert ss == structure.SS_NONE
            continue
        counted += 1
        if ss == ss_type:
            matched += 1

    try:
        percent = float(matched) * 100.0 / counted
    except ZeroDivisionError:
        percent = 0.0

    return round(percent, 6)


def Percent_helix(st):
    """
    Return the percentage of non-hydrogen atoms in the structure 'st' whose
    secondary structure is structure.SS_HELIX.

    """
    return _get_percent_ss_type(st, structure.SS_HELIX)


def Percent_strand(st):
    """
    Return the percentage of non-hydrogen atoms in the structure 'st' whose
    secondary structure is structure.SS_STRAND.

    """
    return _get_percent_ss_type(st, structure.SS_STRAND)


def Percent_loop(st):
    """
    Return the percentage of non-hydrogen atoms in the structure 'st' whose
    secondary structure is structure.SS_LOOP.

    """
    return _get_percent_ss_type(st, structure.SS_LOOP)


#### Dictionary linking Predefined "name" to a module function:
# Each function takes a structure and returns the value that a
# PredefinedCriterion compares against its comparison string.
# Please document any additions in the PredefinedCriterion class docstring.
predefined_function_dict = {
    'Num_rings': Num_rings,
    'Num_aromatic_rings': Num_aromatic_rings,
    'Num_aliphatic_rings': Num_aliphatic_rings,
    'Num_heteroaromatic_rings': Num_heteroaromatic_rings,
    'Num_rotatable_bonds': Num_rotatable_bonds,
    'Num_atoms': Num_atoms,
    'Molecular_weight': Molecular_weight,
    'Num_chiral_centers': Num_chiral_centers,
    'Total_charge': Total_charge,
    'Num_positive_atoms': Num_positive_atoms,
    'Num_negative_atoms': Num_negative_atoms,
    'Num_heavy_atoms': Num_heavy_atoms,
    'Num_molecules': Num_molecules,
    'Num_residues': Num_residues,
    'Molecular_formula': Molecular_formula,
    'Percent_helix': Percent_helix,
    'Percent_strand': Percent_strand,
    'Percent_loop': Percent_loop,
}
# Names of all predefined functions (the keys of the dictionary above).
PREDEFINED_KEYS = list(predefined_function_dict)

##############################################################################
##############################################################################


class PropertyCriterion(Criterion):
    """
    A structure matching criterion that acts on the presence or value of a
    specific structure property.

    If no comparison string is provided, the criterion will check for the
    presence of property 'name'. Otherwise it will compare the value against
    the comparison string definition.

    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

            name - the name of the property being evaluated

            compstr - the property comparison string to be used if present
                      currently in format "<operator> <value>"

        If name or compstr are not specified, parseLine() method should be used
        """
        # The type must be set first: the base initializer validates
        # compstr, and that validation depends on the type.
        self.type = PROPERTY
        Criterion.__init__(self, name, compstr)

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        A structure lacking the property never matches. Without a
        comparison string the mere presence of the property is a match.

        st (Structure) - Structure object
        addprops (bool) - ignored for property criterions
        """
        # This docstring should be kept in sync with other 'matches'
        # docstrings.

        # FIXME make <addprops> work

        try:
            value = st.property[self.name]
        except KeyError:
            # property does not exist == Doesn't match.
            return False

        if not self.compstr:
            # We just wanted to check for existence of the property.
            return True

        # Check whether the value satisfies the comparison string:
        return self.match_compstr(value)

    def getvalue(self, st):
        """
        Return the value of the property for this structure.
        Returns None if the property does not exist.
        """
        try:
            return st.property[self.name]
        except KeyError:
            # property does not exist
            return None


class SmartsCriterion(Criterion):
    """
    A structure matching criterion that looks for a match to a Definition
    instance, which is comprised of a collection of SMARTS patterns.

    For example, for the Definition 'TwoCarbons' that matches against the
    SMARTS patterns [#6][#6], the comparison string

        TwoCarbons < 40

    will match if there are less than 40 carbon-carbon bonds in the
    structure.

    """

    def __init__(self, definition, compstr=None):
        """
        Parameters

            definition - a Definition instance, specifying the SMARTS
                pattern(s) to be included and excluded

            compstr - the comparison string to be used.
                      Currently equal to "<operator> <value>"
                      Used by the expand() method

        If compstr is not specified, parseLine() method should be used
        """
        # The type must be set first: the base initializer validates
        # compstr, and that validation depends on the type.
        self.type = SMARTS
        Criterion.__init__(self, definition.name, compstr)
        self.definition = definition

    def _count_occurances(self, st):
        """
        Retrieve a count of 'includes' matches in the structure, but do not
        count any match whose atoms are all contained in a single
        'excludes' match.

        Raises RuntimeError if the SMARTS evaluation raises ValueError.
        """

        # Get the atomsets for groups that match includes SMARTS pattern.
        # For each group, see if it also matches the excludes patterns.
        # If so, do not count it. Note, that excludes will need to look at
        # not only the atoms matching includes, but surrounding atoms as well.
        # Currently this works correctly only for single definitions, NOT
        # compound definitions (groups of simple definitions)

        inc = self.definition.includes()
        exc = self.definition.excludes()
        try:
            inlist = analyze.evaluate_multiple_smarts(st, inc)
            exlist = analyze.evaluate_multiple_smarts(st, exc)
        except ValueError as err:
            raise RuntimeError(err) from err

        # Remove duplicate matches (Ev:84352): the same atoms reported in a
        # different order count only once.
        include_matches = {tuple(sorted(atoms)) for atoms in inlist}
        # Only membership is needed for the excludes, so keep them as sets.
        exclude_matches = {frozenset(atoms) for atoms in exlist}

        # Count the include matches that are not a subset of any exclude
        # match.
        return sum(1 for inatoms in include_matches
                   if not any(
                       exatoms.issuperset(inatoms)
                       for exatoms in exclude_matches))

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        Current matching behavior is to count the number of matches in the
        definition.includes() list whose atoms are not all contained in a
        match of the definition.excludes() list, and to compare that count
        against the comparison string.

        When 'addprops' is true the count is also stored on the structure
        as property 'i_ligfilter_<name>'.

        Raises RuntimeError if no comparison string is defined.

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """
        if not self.compstr:
            raise RuntimeError("matches(): compstr not defined")
        num = self._count_occurances(st)
        # Defensive only: _count_occurances() as written never returns a
        # negative number.
        if num < 0:
            raise RuntimeError(
                "Structure had more exclude-definition matches than include-definition matches!"
            )
        if addprops:
            propname = 'i_ligfilter_%s' % self.name
            st.property[propname] = num
        return self.match_compstr(num)

    def getvalue(self, st):
        """
        Return the number of times that definition.includes() patterns match
        the structure without being contained in a definition.excludes()
        match.

        """
        return self._count_occurances(st)

    def expand(self, definitions):
        """
        Generate a new SmartsCriterion from the current one in which the
        definition.includes() and definition.excludes() are expanded from
        the definitions list. The comparison string is carried over.

        """
        newdefinition = self.definition.expand(definitions)
        return SmartsCriterion(newdefinition, self.compstr)


class PredefinedCriterion(Criterion):
    """
    A structure matching criterion that acts on the value of a predefined
    function applied to the structure.

    Currently available functions are:
        Num_rings
        Num_aromatic_rings
        Num_aliphatic_rings
        Num_heteroaromatic_rings
        Num_rotatable_bonds
        Num_atoms
        Molecular_weight
        Num_chiral_centers
        Total_charge
        Num_positive_atoms
        Num_negative_atoms
        Num_heavy_atoms
        Num_molecules
        Num_residues
        Molecular_formula
        Percent_helix
        Percent_strand
        Percent_loop

    For example, one definition parseable from the external file is:
        Num_rings == 0

    """

    def __init__(self, name=None, compstr=None):
        """
        Parameters

            name - the name of the function to use. Allowed values are those
                in ligfilter.PREDEFINED_KEYS.

            compstr - the comparison string to evaluate the result of the
                predefined function against

        """
        # The type must be set first: the base initializer validates
        # compstr, and that validation depends on the type.
        self.type = PREDEFINED
        Criterion.__init__(self, name, compstr)
        # Resolved lazily from predefined_function_dict on first use, since
        # the name may only become known through parseLine().
        self._function = None

    def _lookupFunction(self):
        """
        Return the predefined function registered under self.name, caching
        it on the instance.

        Raises KeyError if the name is not in predefined_function_dict.
        """
        if not self._function:
            self._function = predefined_function_dict[self.name]
        return self._function

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this criterion, False if not.

        When 'addprops' is true the computed value is also stored on the
        structure as '<t>_ligfilter_<name>', where <t> is 'i' for an int,
        'r' for a float and 's' for a string. Any other value type raises
        ValueError.

        Raises RuntimeError if no comparison string is defined.

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """
        # This docstring should be kept in sync with other 'matches'
        # docstrings.
        if not self.compstr:
            raise RuntimeError("matches(): compstr not defined")

        num = self._lookupFunction()(st)
        if addprops:
            # bool is a subclass of int but is not a valid value for an
            # integer property, so it is rejected like any unknown type.
            if isinstance(num, int) and not isinstance(num, bool):
                prefix = 'i'
            elif isinstance(num, float):
                prefix = 'r'
            elif isinstance(num, str):
                # e.g. Molecular_formula
                prefix = 's'
            else:
                raise ValueError("Invalid type: %s" % num)
            st.property['%s_ligfilter_%s' % (prefix, self.name)] = num
        return self.match_compstr(num)

    def getvalue(self, st):
        """
        Return the value of the predefined function applied to 'st'.

        For example, return the number of rings, or number of rotatable
        bonds.

        """
        return self._lookupFunction()(st)


class AslCriterion(Criterion):
    """
    This criterion considers a Structure as matching if the stored
    ASL expression match returns at least one atom.

    The ASL expression is kept in the 'name' attribute; there is no
    comparison string.
    """

    def __init__(self, asl):
        """
        Parameters
            asl - the ASL expression string.
        """
        Criterion.__init__(self, asl)
        # NOTE(review): the type is PREDEFINED even though the module
        # defines an ASL constant. Left unchanged because code outside this
        # class may dispatch on the type -- confirm before changing.
        self.type = PREDEFINED

    def matches(self, st, addprops=False):
        """
        Return True if structure 'st' matches this ASL criterion, False if not.

        When 'addprops' is true the result is also stored on the structure
        as property 'b_ligfilter_asl'.

        st (Structure) - Structure object
        addprops (bool) - whether to add properties for each description
        """

        matched = self.getvalue(st)

        if addprops:
            st.property['b_ligfilter_asl'] = matched
        return matched

    def getvalue(self, st):
        """
        Return True if the structure 'st' matches this ASL, False otherwise.
        """

        matched_atoms = analyze.evaluate_asl(st, self.name)
        return len(matched_atoms) > 0


class Definition:
    """
    A class that defines a collection of SMARTS patterns for matching
    against. The includes() method returns a list of those patterns that
    should be matched, and the excludes() method returns those that
    shouldn't.
    """

    def __init__(self, name, includes=None, excludes=None, group=None):
        """
        Parameters

            name - the name of the definition

            includes - an iterable of SMARTS patterns (or definition names)
                to count; defaults to none

            excludes - an iterable of SMARTS patterns (or definition names)
                that can be used to exclude matches in the includes list;
                defaults to none

            group - name of the group that this definition is part of
                (optional). See Ev:50599
        """
        self.name = name
        # Always store fresh lists so that the caller's sequences are never
        # modified and addKey()/removeKey() work for any iterable input.
        self._includes = [] if includes is None else list(includes)
        self._excludes = [] if excludes is None else list(excludes)
        self.group = group

    def addKey(self, key, positive=True):
        """
        Add the SMARTS pattern 'key' to the list of desired matches
        (includes) if 'positive' is True, and to the list of unwanted
        matches (excludes) if 'positive' is False.

        """
        if positive:
            self._includes.append(key)
        else:
            self._excludes.append(key)

    def removeKey(self, key):
        """
        Remove the SMARTS pattern 'key' from the wanted or unwanted matches
        list. The includes are searched first; only one occurrence is
        removed.

        Raises RuntimeError if the key is in neither list.
        """
        if key in self._includes:
            self._includes.remove(key)
        elif key in self._excludes:
            self._excludes.remove(key)
        else:
            raise RuntimeError("Key '{}' not in definition '{}'".format(
                key, self.name))

    def includes(self):
        """
        Return a list of wanted matches.
        """
        return self._includes

    def excludes(self):
        """
        Return a list of unwanted matches.
        """
        return self._excludes

    def _expandIncludes(self, inclist, definitions, masterlist, _active=None):
        """
        For the provided list of keys 'inclist', expand any composite
        definitions, using the contents of the 'definitions' dictionary of
        Definitions.

        Return the expanded definitions via the parameter 'masterlist'.

        For example, if a Definition TwoCarbons in 'definitions' is made up
        of the "includes" [C][C] and [c][c], and 'inclist' includes
        TwoCarbons, then [C][C] and [c][c] will be added to 'masterlist'.

        '_active' is internal: the set of definition names currently being
        expanded, used to detect circular definitions.

        Raises RuntimeError if a definition (directly or indirectly)
        includes itself.
        """
        active = set() if _active is None else _active

        # The result is accumulated in the caller's masterlist.
        for inc in inclist:
            if inc not in definitions:
                # Not a definition name: keep it as it is.
                masterlist.append(inc)
                continue

            if inc in active:
                # Without this guard the recursion would only stop at the
                # interpreter's recursion limit.
                raise RuntimeError(
                    "Circular definition detected while expanding '%s'" % inc)

            active.add(inc)
            try:
                self._expandIncludes(definitions[inc]._includes, definitions,
                                     masterlist, active)
            finally:
                # Only the current expansion path is tracked, so the same
                # definition may legitimately be expanded more than once.
                active.discard(inc)

    def _expandExcludes(self, exclist, definitions, masterlist):
        """
        For the provided list of keys 'exclist', expand any composite
        definitions, using the contents of the 'definitions' dictionary of
        Definitions.

        Return the expanded definitions via the parameter 'masterlist'.

        Note that the expansion uses the "includes" of the 'definitions'
        to generate the masterlist. For example, if a Definition
        TwoCarbons in 'definitions' is made up of the "includes" [C][C] and
        [c][c], and 'exclist' includes TwoCarbons, then [C][C] and [c][c]
        will be added to 'masterlist'.

        Raises RuntimeError if an excluded definition has excludes of its
        own, or if the expansion is circular.
        """
        # TODO: The behavior described above may not be correct. Should we
        # prohibit excluded definitions that themselves exclude other
        # definitions?

        # The result is accumulated in the caller's masterlist.
        for exc in exclist:
            if exc not in definitions:
                masterlist.append(exc)
                continue

            if definitions[exc]._excludes:
                raise RuntimeError(
                    "Error excluding '%s' -- a definition with excludes (-) cannot be included in another definition"
                    % exc)
            self._expandIncludes(definitions[exc]._includes, definitions,
                                 masterlist, {exc})

    def expand(self, definitions):
        """
        Generate a new Definition from the current one in which the
        includes and excludes are expanded from the provided 'definitions'
        dictionary. The name and group are carried over.

        Raises RuntimeError on circular definitions or when an excluded
        definition has excludes of its own.
        """
        newincludes = []
        self._expandIncludes(self._includes, definitions, newincludes)

        newexcludes = []
        self._expandExcludes(self._excludes, definitions, newexcludes)

        return Definition(self.name, newincludes, newexcludes, self.group)

    def __str__(self):
        """
        Return a standard string form of the Definition suitable for
        filter/keys files.

        A definition consisting of a single include is written on one line;
        otherwise each include ("+") and exclude ("-") gets its own line.
        """
        header = "DEFINE %s" % self.name
        if len(self._includes) == 1 and not self._excludes:
            return " ".join([header, self._includes[0]])

        lines = [header]
        lines.extend("    + %s" % inc for inc in self._includes)
        lines.extend("    - %s" % exc for exc in self._excludes)
        return '\n'.join(lines)


class CriterionParser:
    """
    A class for parsing a general property or predefined matching criterion.
    """

    def __init__(self, definitions_dict=None):
        """
        'definitions_dict' is an optional dictionary of Definitions keyed by
        name. When it is given (even if it is empty), definition names used
        in criteria are checked against it; when it is None, no such check
        is made.
        """
        self.definitions_dict = definitions_dict
        # The line being parsed and its line number; used in error messages.
        self.line = None
        self.line_num = None

    def error(self, err):
        """
        Raise a RuntimeError for the message 'err', prefixed with the
        current line number when one is known.
        """
        if self.line_num:
            err = 'Line %i: %s' % (self.line_num, err)
        raise RuntimeError(err)

    def expression_error(self, msg):
        """
        Raise a RuntimeError about an invalid expression. The message
        includes 'msg', the line being parsed and a reminder of the
        expected format.

        """
        self.error("\n  ".join([
            "ERROR: Cound not parse expression due to:", msg,
            "Expression: %s" % self.line,
            "Ligfilter expression needs to be in format: <definition> [<operator> <value>];",
            "where <definition> is a property name or SMARTS definition (no spaces allowed)",
            "Multiple [<operator> <value>] sets must be separated with 'AND' or 'OR'"
        ]))

    def parse(self, line, line_num=None):
        """
        Create a Criterion object from a string.  The method expects an
        input line of the form

            <name>

            ...or...

            <name> <operator> <value>

        The first form is valid only for property criteria.

        If the instance has a 'definitions_dict', definition criteria will
        be checked against it for validity.

        Returns a Criterion. A RuntimeError is raised if the line cannot be
        parsed or names an unknown definition.

        """

        self.line = line
        self.line_num = line_num

        try:
            name = mysplit(line)[0]
        except IndexError:
            msg = 'parse(): Could not parse line: "%s"' % line
            raise RuntimeError(msg)

        # The criterion type is decided from the parsed name rather than
        # from the raw line, so leading whitespace cannot hide a property
        # prefix.
        if name[:2] in ('b_', 's_', 'i_', 'r_'):  # Property
            criterion = PropertyCriterion()
        elif name in PREDEFINED_KEYS:
            criterion = PredefinedCriterion()
        elif self.definitions_dict is None:
            # Unvalidated criterion.  Should this be allowed at all?
            criterion = SmartsCriterion(Definition(name))
        elif name in self.definitions_dict:
            criterion = SmartsCriterion(self.definitions_dict[name])
        else:
            # An empty dictionary still means "validate": nothing is known,
            # so every definition name is rejected. error() always raises.
            self.error("Unknown definition or property: %s" % name)

        try:
            criterion.parseLine(line)
        except RuntimeError as err:
            self.expression_error(str(err))

        return criterion


class DefinitionParser:
    """
    A class for parsing a (possibly multi-line) specification of a
    Definition.

    """

    def __init__(self):
        # The lines being parsed and the number of the line currently being
        # looked at; both are set by parse() and used in error messages.
        self.lines = None
        self.line_num = None

    def error(self, err):
        """
        Print the message 'err', prefixed with the current line number when
        one is known, and raise a RuntimeError with the same text.

        """
        if self.line_num:
            err = 'Line %i: %s' % (self.line_num, err)
        print(err)
        raise RuntimeError(err)

    def parse(self, lines, line_num=None, group=None):
        """
        Return a Definition from a list of lines. No expansion of
        definitions is done.

        General pattern of the specification is

        DEFINE <name> <SMARTS pattern>

        or

        DEFINE <name>
            (+ include_definition)*
            (- exclude_definition)*

        Where the asterisk indicates zero or more of each of the include and
        exclude definitions.

        Options:
            line_num - line number of the first line (for error handling);
                       may be None, in which case errors carry no line number
            group - name of the definition group (or None, if there is no group)

        A RuntimeError is raised (via error()) if the specification is not
        valid.
        """

        self.lines = lines
        self.line_num = line_num

        firstline = lines[0] if lines else ''
        s = mysplit(firstline)
        if not s or s[0] != 'DEFINE':
            self.error("Definition expression must start with 'DEFINE'")
        if len(s) < 2 or len(s) > 3:
            self.error("Invalid DEFINE statement: %s" % firstline)

        name = s[1]  # The second "word" of the line
        if name in PREDEFINED_KEYS:
            self.error("Cannot redefine built-in definition: %s" % name)
        if name[0:2] in ['s_', 'i_', 'r_', 'b_']:
            self.error("Invalid definition name: %s" % name)
        if len(s) == 3:  # 3 "words" on this line
            # Single-line definition.  Value should be a SMARTS definition.
            return Definition(name, includes=[s[2]], group=group)

        # Multi-line definition
        definition = Definition(name, group=group)

        for offset, line in enumerate(lines[1:], start=1):
            # Keep the reported line number in step with the line being
            # examined; without a starting number there is nothing to count.
            if line_num is not None:
                self.line_num = line_num + offset

            s = mysplit(line)
            if not s:  # Blank line
                continue
            if len(s) != 2:  # Have to have 2 "words" on each line
                self.error("Invalid definition line: %s" % line)

            inc_exc, value = s
            if inc_exc == '+':
                definition.addKey(value)
            elif inc_exc == '-':
                definition.addKey(value, False)
            else:
                self.error("Invalid definition line: %s" % line)

        if not definition.includes():
            # This problem concerns the definition as a whole, so report the
            # line on which it starts.
            self.line_num = line_num
            self.error("Definition '%s' has no includes" % definition.name)
        return definition


def read_keys(fh, validate=False, validdefinitions=None):
    """
    Generate lists of Definitions and Criteria from an iterator 'fh'
    that returns a line at a time of the Definition and Criteria
    specification. For example, this iterator can be an open file or a list
    of strings.

    Blank lines and comment lines are skipped, and a trailing "# comment"
    is removed from every other line. A "GROUP <name>" line starts a group
    and a "GROUPEND" line ends it; every Definition is given the group that
    was in effect on its DEFINE line.

    If 'validate' is True, definition names in criteria will be checked
    against known Definitions, including those previously read from 'fh' and
    passed in via 'validdefinitions'.  No expansion of Definitions is done.

    Return a tuple of (Definition list, Criterion list).

    """

    # Work on a copy so that the caller's list is never modified:
    myvaliddefinitions = list(validdefinitions) if validdefinitions else []

    criteria = []
    definitions = []
    dp = DefinitionParser()

    currgroup = None

    # State of the definition whose lines are currently being collected.
    # The group is remembered together with the lines because GROUP and
    # GROUPEND lines may be read before the definition is finally parsed.
    pending_lines = []
    pending_line_num = 0
    pending_group = None

    def finish_definition(lines, first_line_num, group):
        """Parse one collected definition and record it as known."""
        newdefinition = dp.parse(lines, first_line_num, group=group)
        definitions.append(newdefinition)
        myvaliddefinitions.append(newdefinition)

    for line_num, line in enumerate(fh, start=1):
        line = line.strip()  # Get rid of leading and trailing spaces/tabs

        # Skip "#" lines and lines beginning with "# " (comments):
        # Note: Comments MUST have a space after #. This is implemented
        # to support definitions that start with a pound.
        if not line or line == "#" or line.startswith('# '):
            continue

        line = line.split('# ')[0]  # Delete trailing comment
        keyword = mysplit(line)[0]

        if keyword == "GROUP":
            # The group is everything after the space following the keyword:
            # (spaces are supported)
            currgroup = line[6:]
        elif keyword == "GROUPEND":
            currgroup = None
        elif keyword == 'DEFINE':
            if pending_lines:
                # The definition we were reading is done
                finish_definition(pending_lines, pending_line_num,
                                  pending_group)

            # Start the new definition
            pending_lines = [line]
            pending_line_num = line_num
            pending_group = currgroup
        elif keyword in ('+', '-'):
            pending_lines.append(line)
        else:
            # Criterion
            if pending_lines:
                # The definition we were reading is done.
                finish_definition(pending_lines, pending_line_num,
                                  pending_group)
                pending_lines = []
                pending_line_num = 0
                pending_group = None

            if validate:
                definitions_dict = {d.name: d for d in myvaliddefinitions}
                cp = CriterionParser(definitions_dict)
            else:
                cp = CriterionParser()

            criteria.append(cp.parse(line, line_num))

    if pending_lines:
        # The definition we were reading is done.
        finish_definition(pending_lines, pending_line_num, pending_group)

    return (definitions, criteria)


#
# Find and read the default definitions from the Schrodinger installation.
#

# Fail early, with a clear message, when SCHRODINGER is not set:
try:
    SCHRODINGER = os.environ['SCHRODINGER']
except KeyError:
    raise Exception("SCHRODINGER is not defined.")

if 'MMSHARE_EXEC' in os.environ:
    MMSHARE_EXEC = os.environ['MMSHARE_EXEC']
else:
    # Hunt for mmshare:
    try:
        MMSHARE_EXEC = jobutil.hunt("mmshare")
    except Exception:
        # A failed hunt is reported below in the same way as an empty result.
        MMSHARE_EXEC = None
    if not MMSHARE_EXEC:
        raise Exception("Could not determine MMSHARE_EXEC.")

# Candidate locations of the standard LigFilter definitions file:

MMSHARE_datadir = os.path.join(MMSHARE_EXEC, "..", "..", "data")
mmshare_data_file = os.path.join(MMSHARE_datadir, DEFAULT_PATTERNS_FILE)

try:
    appdata_dir = mm.mmfile_schrodinger_appdata_dir()
except Exception:
    raise RuntimeError(
        "Could not determine the Schrodinger application data directory.")
appdata_file = os.path.join(appdata_dir, DEFAULT_PATTERNS_FILE)

# Search: 1) CWD, 2) the application data directory, 3) MMSHARE/data:
if os.path.isfile(DEFAULT_PATTERNS_FILE):
    _definition_file_name = DEFAULT_PATTERNS_FILE
    print('Using local copy of %s' % DEFAULT_PATTERNS_FILE)
elif os.path.isfile(appdata_file):
    _definition_file_name = appdata_file
else:
    _definition_file_name = mmshare_data_file

# Open the definition file:
try:
    _definition_fh = open(_definition_file_name)
except OSError:
    raise Exception("Failed to open %s file." % _definition_file_name)

# Read the definitions (and criteria, if present) from the definitions file.
# The "with" block closes the file even if reading it fails:
with _definition_fh:
    (default_definitions, _criteria) = read_keys(_definition_fh, validate=True)

if _criteria:
    print("")
    print("WARNING: There are criteria in the default definitions file. They")
    print("         will be ignored.")
    print("")


def get_default_criterion_parser():
    """
    Return a CriterionParser that knows the default definitions, each of
    them in expanded form.
    """

    # Index the default definitions by name:
    by_name = {d.name: d for d in default_definitions}

    # Expand every definition against the unexpanded set. The parser is
    # given the expanded definitions only.
    expanded = {name: d.expand(by_name) for name, d in by_name.items()}

    return CriterionParser(expanded)


def generate_criterion(condition, cp=None):
    """
    Ev:55805
    Build a Criterion object from a condition string such as
    "Num_atoms < 100" and return it.

    The returned criterion can be then used as follows:

      if criterion.matches(st):
          <do>

    The condition is parsed with the CriterionParser 'cp' when one is
    given; otherwise a parser with the default definitions is created
    for the call.
    """

    parser = get_default_criterion_parser() if cp is None else cp
    return parser.parse(condition)


def st_matches_criteria(st, criteria_list, match_any=False, addprops=False):
    """
    Check the structure 'st' against the criteria in 'criteria_list'.

    Returns None if the structure matches, and otherwise a string that
    explains why it does not.

    match_any - if True, st is considered to match if it matches at least
                one criterion; otherwise all criteria must be matched.

    addprops - passed on to the matches() method of every criterion.

    Only criteria of the property, SMARTS, predefined and ASL types are
    evaluated, in that order.

    """

    # Order the criteria by type, fastest to evaluate first (the relative
    # order within one type is kept):
    ordered = [
        criterion
        for wanted_type in (PROPERTY, SMARTS, PREDEFINED, ASL)
        for criterion in criteria_list
        if criterion.type == wanted_type
    ]

    for criterion in ordered:
        if criterion.matches(st, addprops=addprops):
            if match_any:
                # One match is all that is needed.
                return None
        elif not match_any:
            # One miss is enough to fail.
            return "did not match criteria: %s" % criterion.name

    # Getting here means no match was found (match_any) or no miss was
    # found (match all).
    return "did not match any createria" if match_any else None

    #EOF