Source code for schrodinger.application.phase.shape_screen_reporter.option_utils

"""
Performs parsing and validation of options for generating and using shape
screen report databases.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import argparse
import enum
import os

from schrodinger.application.phase.shape_screen_reporter import prop_utils
from schrodinger.infra import phase
from schrodinger.structutils import filter
from schrodinger.utils import cmdline
from schrodinger.utils import fileutils

SHAPE_SCREEN_REPORTER = "shape_screen_reporter"
ClusterBy = enum.Enum('ClusterBy', 'features scaffolds')
IMAGE_QUALITY = ["low", "medium", "high"]
IMAGE_HEIGHT = {"low": 400, "medium": 800, "high": 1200}
DEFAULT_COLUMNS = 3
MAX_IMAGES_TO_PRINT = 150
IMAGES_PER_PAGE = 15
PageSize = enum.Enum('PageSize', 'Letter A4')
PAGE_SIZES = {
    PageSize.Letter.name: PageSize.Letter,
    PageSize.A4.name: PageSize.A4
}

FILTERED_HITS_FILE_TYPES = [
    fileutils.MAESTRO, fileutils.SD, fileutils.SMILESCSV
]


[docs]def add_cluster_args(parser): """ Adds arguments for the "cluster" task. :param parser: Argument parser object. :type parser: argparse.ArgumentParser """ parser.add_argument( '-by', choices=[mode.name for mode in ClusterBy], default=ClusterBy.features.name, help='Whether to cluster hits by the query features they match or the ' 'largest Bemis-Murcko scaffold they contain (default: %(default)s).') parser.add_argument( '-hits', metavar='<hits_file>', help='Output Maestro or SD file for clustered hits. If omitted, a ' 'textual summary of the clusters is printed but no hits are written.') parser.add_argument( '-min_hits', metavar='<m>', type=int, default=5, help='Minimum number of hits per cluster. This option is used in lieu ' 'of enforcing a maximum number of features per cluster or a maximum ' 'scaffold size (default: %(default)d).') cluster_features_args = parser.add_argument_group( 'Cluster "-by features" Options') cluster_features_args.add_argument( '-hypo', metavar='<prefix>', help='Create a pharmacophore hypothesis to represent the features ' 'matched in each cluster. For example, a cluster matching features ' 'A2, D6, H9 and R12 would yield <prefix>_A2_D6_H9_R12.phypo. ' 'Hypotheses are returned in the zip archive <jobname>_hypos.zip.') cluster_features_args.add_argument( '-min_features', metavar='<n>', type=int, help='Minimum number of pharmacophore features a hit must match in ' 'order to be assigned to a cluster (default: 3).') cluster_features_args.add_argument( '-tol', metavar='<dict>', help='A dictionary-like specification of matching tolerances. For ' 'example, "N:1.5,P:1.5,H:2.5" overrides the default tolerances for all ' 'features of type N, P and H. Any feature type not specified through ' 'this option is assigned a tolerance of 2.0.') cluster_features_args.add_argument( '-rules', metavar='<dict>', help='A dictionary-like specification of feature type matching rules. ' 'For example, "A:AN,H:HR" allows all features of type A to match A or ' 'N and all features of type H to match H or R. Any feature type not ' 'specified through this option is allowed to match only features of ' 'the same type.') cluster_scaffolds_args = parser.add_argument_group( 'Cluster "-by scaffolds" Options') cluster_scaffolds_args.add_argument( '-fuzzy', action='store_true', help='Use fuzzy scaffolds, where atoms are distinguished only by ' 'whether they are aliphatic or aromatic.') cluster_scaffolds_args.add_argument( '-group_missing', action='store_true', help='Group hits for which no scaffold was found (e.g., acyclic and ' 'fragmented structures) into a single cluster. The default is to ' 'exclude those hits.')
[docs]def add_create_args(parser): """ Adds arguments for the "create" task. :param parser: Argument parser object. :type parser: argparse.ArgumentParser """ parser.add_argument( '-hits', metavar='<hits_file>', required=True, help='Maestro or SD file containing hits from a shape screen, a ' 'pharmacophore screen or a Glide docking job. This option is required.') parser.add_argument( '-query', metavar='<query_file>', help='Maestro or SD file containing the shape query or other suitable ' 'structure to which hits are aligned. In the case of a pharmacophore ' 'screen, the query should be the reference ligand for the hypothesis. ' 'In the case of a Glide docking job, a suitable query might be the ' 'bioactive pose of the native ligand. If omitted, the first suitable ' 'entry in <hits_file> is used as the shape query. This is the first ' 'ligand-size structure or the reference ligand from the first ' 'hypothesis.') parser.add_argument( '-fd', metavar='<fd_file>', help='Use pharmacophore feature definitions in the supplied file ' 'rather than the default definitions in the software distribution.') parser.add_argument( '-props', metavar='<list>', help='Comma-separated list of properties in the hits file that should ' 'be stored in the database to allow application of property filters ' '(see "filter" task). A set of physicochemical properties is computed ' 'and stored by default, so this list would typically consist of other ' 'computed or experimental properties that are of critical interest. ' 'Property names must be of the form <t>_<source>_<name>, where <t> is ' 'the property type ("b", "i", "r", "s"), <source> indicates the origin ' 'of the property ("autoqsar", "phase", "user", etc.), and <name> is a ' 'descriptive name ("Pred_Class", "Fitness", "pIC50", etc.).') parser.add_argument( '-epik', action='store_true', help=f'Run Epik on the hits so that {prop_utils.ION_CLASS_VSDB} ' '("Acidic", "Weakly Acidic", etc.) can be stored. This increases ' 'database creation time by about a factor of 50, so use of -HOST with ' 'multiple CPUs is strongly encouraged. Note that hits must be ' 'provided in Maestro format when using this option, and that the most ' 'probable neutral tautomer is retained, which may affect the values ' 'of other calculated properties.') parser.add_argument('-overwrite', action='store_true', help='Overwrite existing shape screen report database.')
[docs]def add_export_args(parser): """ Adds arguments for the "export" task. :param parser: Argument parser object. :type parser: argparse.ArgumentParser """ add_feature_matching_args(parser, features_required=True) parser.add_argument( '-hits', metavar='<hits_file>', help='Output Maestro or SD file for hits. Hits are not written if this ' 'option is omitted.') parser.add_argument( '-hypo', metavar='<file>.phypo', help='Output hypothesis file. Hypothesis is not created if this option ' 'is omitted.')
[docs]def add_feature_matching_args(parser, features_required=False): """ Adds arguments for specifying a list of features to match, positional tolerances and feature type matching rules. :param parser: Argument parser object. :type parser: argparse.ArgumentParser :param features_required: If True, the help message will indicate that the list of features is required. """ if features_required: required_mesg = ' This option is required.' else: required_mesg = (' If omitted, a match to at least 3 features in the ' 'query is required.') parser.add_argument( '-features', metavar='<list>', required=features_required, help='Comma-delimited list of query features to match, e.g., ' f'"A2,D6,H9,R12".{required_mesg} Structures that provide a match ' 'are output in order of decreasing shape similarity to the query.') parser.add_argument( '-tol', metavar='<dict>', help='A dictionary-like specification of matching tolerances. For ' 'example, "A2:1.5,H9:2.5" overrides the default tolerances for ' 'features A2 and H9. All other features would be assigned a ' 'tolerance of 2.0.') parser.add_argument( '-rules', metavar='<dict>', help='A dictionary-like specification of feature type matching rules. ' 'For example, "A2:AN,H9:HR" means feature A2 is allowed to match ' 'features of type A or N and feature H9 is allowed to match features ' 'of type H or R. All other features would be allowed to match only ' 'features of the same type.')
[docs]def add_filter_args(parser): """ Adds arguments for the "filter" task. :param parser: Argument parser object. :type parser: argparse.ArgumentParser """ parser.add_argument( '-props', metavar='<filename>', help='File containing one or more property filters a hit must satisfy ' 'to survive. Each line must be of the form <property>,<operator>,' '<value1>[,<value2>...], where <property> is the name of a property (' 'e.g., i_vsdb_HBA, r_vsdb_AlogP), <operator> is a supported SQL ' 'logical operator ("=", "<", ">", "<=", ">=", "<>", "BETWEEN", "LIKE", ' '"IN") and the remainder of the line contains the appropriate number ' 'of target values. Must be exactly 2 values for BETWEEN, 1 or more ' 'values for IN and exactly 1 value for all other operators. Use ' '-list_props to get a list of available properties.') parser.add_argument( '-smarts', metavar='<filename>', help='File containing one or more SMARTS filters a hit must satisfy ' 'to survive. Each line must be of the form <smarts>,<min>,<max>[,' '<name>], where <smarts> is a SMARTS string, <min> is the minimum ' 'required number of occurences of the associated substructure, <max> ' 'is the maximum allowed number of occurences and <name> is an ' 'optional name for the filter.') parser.add_argument( '-diverse', metavar='<fraction>', type=float, help='Select a diverse fraction of hits from all rows in the report ' 'database or after applying other filters. The supplied fraction must ' 'lie on the interval (0.0, 0.2].') parser.add_argument( '-hits', metavar='<hits_file>', help='Output Maestro, SD or CSV file for hits that satisfy all ' 'filters (default: <jobname>.csv).') parser.add_argument( '-all_props', action='store_true', help='If writing hits in CSV format, include all properties present ' 'in the hit structures. The default is to write only the properties ' 'that are available for filtering.') parser.add_argument( '-list_props', action='store_true', help='Print the list of properties that are available for filtering ' 'and exit.') parser.add_argument( '-dp', metavar='<delim>', default=',', help="Property filter file delimiter. For tab, use -dp tab (default: " "',').") parser.add_argument( '-ds', metavar='<delim>', default=',', help="SMARTS filter file delimiter. For tab, use -ds tab (default: " "',').")
[docs]def add_print_args(parser): """ Adds arguments for the "print" task. :param parser: Argument parser object. :type parser: argparse.ArgumentParser """ add_feature_matching_args(parser, features_required=False) parser.add_argument( '-pdf', metavar='<pdf_file>', help='Output PDF file for table. If omitted, the list of features ' 'to match determines the file name, e.g., A2_D6_H9_R12.pdf.') parser.add_argument('-quality', choices=IMAGE_QUALITY, default='medium', help='Resolution of 2D images (default: medium).') parser.add_argument( '-cols', metavar='<n>', type=int, default=DEFAULT_COLUMNS, help='Number of columns in the table (default: %(default)d).') npages = round(MAX_IMAGES_TO_PRINT / IMAGES_PER_PAGE) parser.add_argument( '-limit', metavar='<m>', type=int, default=MAX_IMAGES_TO_PRINT, help='Maximum number of 2D images to print. The default is ' f'{MAX_IMAGES_TO_PRINT}, which produces {npages} pages of output with ' 'the default number of columns.') parser.add_argument('-size', choices=[PageSize.Letter.name, PageSize.A4.name], default=PageSize.Letter.name, help='Page size (default: %(default)s).')
[docs]def check_feature_string(s, expect_number=False): """ Returns a non-empty error message if s is not a valid pharmacophore feature specification. :param s: Pharmacophore feature string :type s: str :param expect_number: Whether to expect a feature number at the end of s :type expect_number: bool :return: Non-empty error message if s is invalid :rtype: str """ if not s: return 'Feature string is empty' if s[0] not in phase.LEGAL_BASE_FEATURE_TYPES: return f'Illegal feature type: "{s[0]}"' if expect_number: if len(s) == 1: return f'Missing number in feature string "{s}"' elif not is_positive_int(s[1:]): return f'Illegal feature number: "{s[1:]}"' elif len(s) > 1: c = 'character' if len(s) == 2 else 'characters' return f'Unexpected {c} "{s[1:]}" in feature type string "{s}"' return ''
[docs]def check_permitted_features_string(s): """ Returns a non-empty error message if s contains any illegal characters. :param s: String of permitted feature types :type s: str :return: Non-empty error message if s is invalid :rtype: str """ for c in s: if c not in phase.LEGAL_BASE_FEATURE_TYPES: return f'Illegal character "{c}" in permitted features string "{s}"' return ''
[docs]def get_parser(): """ Creates argparse.ArgumentParser with supported command line options. :return: Argument parser object :rtype: argparse.ArgumentParser """ parser = argparse.ArgumentParser( prog=SHAPE_SCREEN_REPORTER, formatter_class=argparse.RawDescriptionHelpFormatter) subparsers = parser.add_subparsers( dest='task', metavar='<task>', help='The task to perform. For detailed help on a specific task, use ' f'{SHAPE_SCREEN_REPORTER} <task> -h.') parser_create = subparsers.add_parser( 'create', help='Create a new shape screen report database.') add_create_args(parser_create) parser_filter = subparsers.add_parser( 'filter', help='Filter hits based on properties, SMARTS or diversity.') add_filter_args(parser_filter) parser_cluster = subparsers.add_parser( 'cluster', help='Cluster hits according to the query features they match or the ' 'largest Bemis-Murcko scaffold they contain.') add_cluster_args(parser_cluster) parser_print = subparsers.add_parser( 'print', help='Generate a PDF file containing a table of annotated 2D hits that ' 'yield an in-place match to a subset of features in the query.') add_print_args(parser_print) # Add a visual separator between help for last task and <report>.vsdb. dashes = '----------------------------------------------------------' parser_export = subparsers.add_parser( 'export', help='Export 3D hits that yield an in-place match to a subset of ' 'features in the query and/or export a pharmacophore hypothesis ' f'created from those features. {dashes}') add_export_args(parser_export) parser.add_argument( 'vsdb', metavar='<report>.vsdb', help='Shape screen report database. Must already exist for all tasks ' 'except "create". The default job name for a given task is ' '<report>_<task>.') jobcontrol_options = [cmdline.HOST, cmdline.JOBNAME, cmdline.TMPDIR] cmdline.add_jobcontrol_options(parser, options=jobcontrol_options) return parser
[docs]def is_mae_or_sd(filename): """ Returns True if filename corresponds to a Maestro or SD file. :param filename: File name :type filename: str :return: True if filename is a Maestro or SD file :rtype: bool """ file_format = phase.get_phase_file_format(filename) return file_format in [ phase.PhpFileFormat_PHP_FORMAT_MAE, phase.PhpFileFormat_PHP_FORMAT_SD ]
[docs]def is_positive_int(value): """ Returns True if the provided string can be cast to a positive int. :param value: The string to be tested :type value: str :return: Whether value is a positive int (1, 2, etc.) :rtype: bool """ try: return int(value) > 0 except ValueError: return False
[docs]def is_positive_numeric(value): """ Returns True if the provided string can be cast to a float value > 0. :param value: The string to be tested :type value: str :return: Whether value is a positive numeric :rtype: bool """ try: return float(value) > 0.0 except ValueError: return False return True
[docs]def parse_feature_rules(rules_string, expect_number=False): """ Given a comma-delimited string of <feature>:<permitted> pairs, where <feature> may or may not contain a number, and where <permitted> is a string of feature types that <feature> is allowed to match, this function attempts to translate the string into a dictionary and reports any syntax errors it finds. :param rules_string: Comma-delimited string of <feature>:<permitted> pairs :type rules_string: str :param expect_number: Whether to expect a feature number at the end of each feature string :type expect_number: bool :return: tuple of <feature>:<permitted> dictionary and non-empty error message if rules_string is illegal :rtype: dict{str: float}, str """ if not rules_string: return {}, '' feature_rules_dict = {} for token in rules_string.split(','): values = token.split(':') if len(values) != 2: mesg = f'Illegal <feature>:<permitted> specification: "{token}"' return {}, mesg feature = values[0].strip() mesg = check_feature_string(feature, expect_number) if mesg: return {}, mesg permitted = values[1].strip() mesg = check_permitted_features_string(permitted) if mesg: return {}, mesg feature_rules_dict[feature] = sanitize_permitted(feature, permitted) return feature_rules_dict, ''
[docs]def parse_feature_tol(tol_string, expect_number=False): """ Given a comma-delimited string of <feature>:<tol> pairs, where <feature> may or may not contain a number, this function attempts to translate the string into a dictionary and reports any syntax errors it finds. :param tol_string: Comma-delimited string of <feature>:<tol> pairs :type tol_string: str :param expect_number: Whether to expect a feature number at the end of each feature string :type expect_number: bool :return: tuple of <feature>:<tol> dictionary and non-empty error message if tol_string is illegal :rtype: dict{str: float}, float """ if not tol_string: return {}, '' feature_tol_dict = {} for token in tol_string.split(','): values = token.split(':') if len(values) != 2: mesg = f'Illegal <feature>:<tol> specification: "{token}"' return {}, mesg feature = values[0].strip() mesg = check_feature_string(feature, expect_number) if mesg: return {}, mesg tol = values[1].strip() if not is_positive_numeric(tol): return {}, f'Illegal feature tolerance: "{tol}"' feature_tol_dict[feature] = float(tol) return feature_tol_dict, ''
[docs]def sanitize_permitted(feature, permitted): """ Constructs a permitted features string that conforms to the format Phase expects, which is the type of the feature itself, followed by other types it's allowed to match. For example, if feature is 'R' and permitted is 'HR', the sanitized permitted features string would be 'RH'. :param feature: Feature type, with or without a trailing number :type feature: str :param permitted: String of types feature is allowed to match :type permitted: str :return: Sanitized version of permitted :rtype: str """ unique_types = sorted(set(permitted)) sanitized = [feature[0]] for feature_type in unique_types: if feature_type != sanitized[0]: sanitized.append(feature_type) return ''.join(sanitized)
[docs]def validate_cluster_args(args): """ Checks the validity of arguments for "cluster" task. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ if args.hits and not is_mae_or_sd(args.hits): return False, 'Hits must be written to a Maestro or SD file' if args.min_hits < 1: return False, 'Minimum number of hits per cluster must be >= 1' # Check for illegal combinations with -by features|scaffolds. args_dict = vars(args) if args.by == ClusterBy.features.name: legal_scaffold_args = {'fuzzy': False, 'group_missing': False} for key, value in legal_scaffold_args.items(): if args_dict[key] != value: return False, f'-{key} is not legal with -by features' if args.min_features is None: args.min_features = 3 elif args.min_features < 1: return False, 'Minimum number of features to match must be >= 1' else: legal_feature_args = { 'hypo': None, 'min_features': None, 'tol': None, 'rules': None } for key, value in legal_feature_args.items(): if args_dict[key] != value: return False, f'-{key} is not legal with -by scaffolds' feature_tol_dict, mesg = parse_feature_tol(args.tol) if mesg: return False, mesg feature_rules_dict, mesg = parse_feature_rules(args.rules) if mesg: return False, mesg return True, ''
[docs]def validate_create_args(args): """ Checks the validity of arguments for "create" task. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ if not is_mae_or_sd(args.hits): return False, 'Hits must be provided in a Maestro or SD file' if not os.path.isfile(args.hits): return False, f'Hits file "{args.hits}" not found' if args.query: if not is_mae_or_sd(args.query): return False, 'Shape query must be provided in a Maestro or SD file' if not os.path.isfile(args.query): return False, f'Shape query file "{args.query}" not found' if args.fd: if not os.path.isfile(args.fd): return False, f'Feature definitions file "{args.fd}" not found' if args.props: trim = True props = phase.tokenizeDelimitedString(args.props, ',', trim) # Must be m2io-style property names. for prop in props: try: column_type = phase.PhpDbPropertyTable.getColumnType(prop) except phase.PhpException as exc: return False, str(exc) if args.epik: hit_format = phase.get_phase_file_format(args.hits) if hit_format == phase.PhpFileFormat_PHP_FORMAT_SD: return False, 'Hits must be in Maestro format when -epik is used' return True, ''
[docs]def validate_export_args(args): """ Checks the validity of arguments for "export" task. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ args_ok, mesg = validate_feature_matching_args(args) if not args_ok: return False, mesg if args.hits: if not is_mae_or_sd(args.hits): return False, 'Hits must be written to a Maestro or SD file' if args.hypo: hypo_ext = phase.PHASE_HYPO_FILE_EXT if not args.hypo.endswith(hypo_ext): return False, f'Hypothesis file must end with "{hypo_ext}"' if not args.hits and not args.hypo: return False, 'Must supply at least one of the following: -hits, -hypo' return True, ''
[docs]def validate_feature_matching_args(args): """ Checks validity of args -features <list>, -tol <dict> and -rules <dict>, where the features in the <dict> specifications are expected to include feature numbers. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ if args.features: features = args.features.split(',') for feature in features: mesg = check_feature_string(feature, True) if mesg: return False, mesg unique_features = set(features) if len(unique_features) < len(features): return False, 'Duplicate features supplied' if args.tol: feature_tol_dict, mesg = parse_feature_tol(args.tol, True) if mesg: return False, mesg if args.features: for feature, tol in feature_tol_dict.items(): if feature not in unique_features: mesg = (f'Feature "{feature}" is not in list ' f'"{args.features}"') return False, mesg if args.rules: feature_rules_dict, mesg = parse_feature_rules(args.rules, True) if mesg: return False, mesg if args.features: for feature, permitted in feature_rules_dict.items(): if feature not in unique_features: mesg = (f'Feature "{feature}" is not in list ' f'"{args.features}"') return False, mesg return True, ''
[docs]def validate_filter_args(args): """ Checks the validity of arguments for "filter" task. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ if not (args.props or args.smarts or args.diverse or args.list_props): msg = ('At least one of the following is required: -props, -smarts, ' '-diverse, -list_props') return False, msg if args.props: if not os.path.isfile(args.props): return False, f'Property filter file "{args.props}" not found' try: prop_utils.read_property_filters(args.props, args.dp) except ValueError as exc: return False, str(exc) if args.smarts: if not os.path.isfile(args.smarts): return False, f'SMARTS filter file "{args.smarts}" not found' try: filter.SmartsFilter(filename=args.smarts, delimiter=args.ds) except ValueError as exc: return False, str(exc) if args.diverse is not None and (args.diverse <= 0 or args.diverse > 0.2): return False, 'Diverse fraction must lie on the interval (0.0, 0.2]' if args.hits: ftype = fileutils.get_structure_file_format(args.hits) if ftype not in FILTERED_HITS_FILE_TYPES: return False, f'Illegal filtered hits file type: "{args.hits}"' return True, ''
[docs]def validate_print_args(args): """ Checks the validity of arguments for "print" task. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ args_ok, mesg = validate_feature_matching_args(args) if not args_ok: return False, mesg if args.pdf: if not args.pdf.endswith(".pdf"): return False, 'PDF file extension must be ".pdf"' if args.cols < 1: return False, 'Number of columns in table must be 1 or greater' if args.limit < 1: return False, 'Maximum number of 2D images must be 1 or greater' return True, ''
VALIDATE_TASK_DICT = { 'create': validate_create_args, 'filter': validate_filter_args, 'cluster': validate_cluster_args, 'print': validate_print_args, 'export': validate_export_args }
[docs]def validate_args(args): """ Checks the validity of command line arguments. :param args: argparser.Namespace with command line arguments :type args: argparser.Namespace :return: tuple of validity and non-empty error message if not valid :rtype: bool, str """ vsdb_ext = phase.PHASE_VSDB_FILE_EXT if not args.vsdb.endswith(vsdb_ext): return False, f'Shape screen report database must end with "{vsdb_ext}"' vsdb_exists = os.path.isfile(args.vsdb) if args.task == 'create': if vsdb_exists and not args.overwrite: mesg = f'{args.vsdb} exists. Remove first or use -overwrite' return False, mesg elif not vsdb_exists: return False, f'Shape screen report database "{args.vsdb}" not found' return VALIDATE_TASK_DICT[args.task](args)