# Source code for schrodinger.application.phase.packages.mmp2d

"""
This module provides functions that perform various types of queries on a
matched molecular pairs 2D database.
"""

import math
import sqlite3
from collections import defaultdict
from collections import namedtuple

# Names of the entries stored in each MMP transformation dictionary. The
# attribute (upper case) is used in code; its value (lower case) is the
# actual dictionary key.
TRANS_KEYS = namedtuple(
    "TransKeys",
    ["FROM_SMILES", "TO_SMILES", "MIN", "MAX", "AVG", "STD", "COUNT"],
)(
    FROM_SMILES="from_smiles",
    TO_SMILES="to_smiles",
    MIN="min",
    MAX="max",
    AVG="avg",
    STD="std",
    COUNT="count",
)

# Keys of the statistical properties taken from the
# rule_environment_statistics table, in (min, max, avg, std, count) order:
ENV_STATS = [
    TRANS_KEYS.MIN,
    TRANS_KEYS.MAX,
    TRANS_KEYS.AVG,
    TRANS_KEYS.STD,
    TRANS_KEYS.COUNT,
]


class MMPDatabaseConnection:
    """
    Context manager for a 2D MMP SQLite database connection.

    The connection is opened when the ``with`` block is entered and closed
    when it is left. While the block is active, the open connection is
    available through the ``connection`` attribute.
    """

    def __init__(self, db_path):
        """
        :param db_path: Path to MMP 2D database
        :type db_path: str
        """
        # No connection exists until the context is entered.
        self.connection = None
        self.db_path = db_path

    def __enter__(self):
        """
        Opens the SQLite connection.

        :return: This object, with ``connection`` set to the open connection
        :rtype: MMPDatabaseConnection
        """
        opened = sqlite3.connect(self.db_path)
        self.connection = opened
        return self

    def __exit__(self, *exc_info):
        """
        Closes the SQLite connection. Nothing is returned, so an exception
        raised inside the ``with`` block continues to propagate.
        """
        self.connection.close()
def get_activity_cliffs(mmp2d_db_path, property_name, property_diff):
    """
    Given an MMP 2D SQLite database, finds every pair of compound ids in the
    pair table whose values for the indicated property differ, in absolute
    terms, by at least property_diff.

    Pairs in which either compound has no value for the property are
    skipped. Each returned pair is ordered (smaller id, larger id) and is
    reported only once.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :param property_name: Name of the property for activity cliffs
    :type property_name: str

    :param property_diff: Threshold on absolute property difference
    :type property_diff: float

    :return: Pairs of compound ids that exhibit an activity cliff, sorted
    :rtype: list[(int, int)]

    :raises: KeyError if property_name is not found among the property names
        in the database
    """
    values_by_id = get_property_values(mmp2d_db_path, property_name)
    query = "SELECT compound1_id, compound2_id from pair"
    cliffs = set()
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        for first_id, second_id in db.connection.cursor().execute(query):
            # A pair can only be assessed if both compounds have a value.
            if first_id not in values_by_id or second_id not in values_by_id:
                continue
            delta = math.fabs(values_by_id[first_id] - values_by_id[second_id])
            if delta < property_diff:
                continue
            # Store in canonical (low, high) order so that (a, b) and (b, a)
            # collapse into a single entry.
            if first_id > second_id:
                first_id, second_id = second_id, first_id
            cliffs.add((first_id, second_id))
    return sorted(cliffs)
def get_compound_smiles(mmp2d_db_path):
    """
    Builds the compound id --> SMILES dictionary from the id and
    input_smiles columns of the compound table in the supplied MMP 2D SQLite
    database.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: compound id --> SMILES dictionary
    :rtype: dict{int: str}
    """
    return select_dictionary(mmp2d_db_path,
                             "SELECT id, input_smiles FROM compound")
def get_ids_from_pairs(compound_pairs):
    """
    Collects the distinct compound ids that occur anywhere in the provided
    pairs of compound ids.

    :param compound_pairs: Pairs of compound ids
    :type compound_pairs: list[(int, int)]

    :return: Unique compound ids in increasing order
    :rtype: list[int]
    """
    distinct_ids = {
        compound_id for pair in compound_pairs for compound_id in pair
    }
    return sorted(distinct_ids)
def get_pair_rule_env_ids(mmp2d_db_path, compound_pairs):
    """
    Returns a dictionary that maps each of the provided pairs of compound ids
    to a sorted list of rule environment ids, which are primary keys in the
    rule_environment table.

    For each pair (id1, id2), both the input order and the reverse order
    (id2, id1) are considered, and an entry is made under whichever order(s)
    actually occur in the pair table. Thus, both orders should be checked
    when using the returned dictionary, and if a given pair must be reversed
    to find a match in the dictionary, the order-sensitive data in the
    transformation must be reversed/negated.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :param compound_pairs: Pairs of compound ids. Any iterable of pairs is
        accepted (e.g., the list returned by get_activity_cliffs).
    :type compound_pairs: set((int, int))

    :return: (compound1_id, compound2_id) --> rule environment ids
    :rtype: dict{(int, int): list[int]}
    """
    # Build the lookup once, in both orientations, so that every row of the
    # pair table costs a single O(1) membership test. Testing against the
    # caller's container directly would be a linear scan per row whenever a
    # list is supplied.
    wanted_pairs = set()
    for compound_pair in compound_pairs:
        ordered = tuple(compound_pair)
        wanted_pairs.add(ordered)
        wanted_pairs.add(ordered[::-1])

    pair_ids_dict = defaultdict(set)
    select = ("SELECT compound1_id, compound2_id, rule_environment_id "
              "FROM pair")
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        cursor = db.connection.cursor()
        for cmpd_id1, cmpd_id2, rule_env_id in cursor.execute(select):
            # The true order in the database; only this order is stored.
            original = (cmpd_id1, cmpd_id2)
            if original in wanted_pairs:
                pair_ids_dict[original].add(rule_env_id)

    # Sort the rule environment ids and convert to lists.
    return {key: sorted(value) for key, value in pair_ids_dict.items()}
def get_property_names(mmp2d_db_path):
    """
    Builds the property name --> property id dictionary from the name and id
    columns of the property_name table in the supplied MMP 2D SQLite
    database.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: property name --> property id dictionary
    :rtype: dict{str: int}
    """
    return select_dictionary(mmp2d_db_path,
                             "SELECT name, id FROM property_name")
def get_property_values(mmp2d_db_path, prop_name):
    """
    Builds the compound id --> property value dictionary for a single
    property from the compound_property table of the supplied MMP 2D SQLite
    database.

    The property name is first translated to its id through
    get_property_names, and only rows with that property_name_id are read.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :param prop_name: Property name whose values are desired
    :type prop_name: str

    :return: compound id --> property value dictionary
    :rtype: dict{int: float}

    :raises: KeyError if prop_name is not present in the property_name table
    """
    name_to_id = get_property_names(mmp2d_db_path)
    property_id = name_to_id[prop_name]
    query = ("SELECT compound_id, value from compound_property where "
             "property_name_id = %d ORDER by compound_id") % property_id
    return select_dictionary(mmp2d_db_path, query)
def get_public_ids(mmp2d_db_path):
    """
    Builds the compound id --> public id dictionary from the id and
    public_id columns of the compound table in the supplied MMP 2D SQLite
    database.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: compound id --> public id dictionary
    :rtype: dict{int: str}
    """
    return select_dictionary(mmp2d_db_path,
                             "SELECT id, public_id FROM compound")
def get_rule_env_rule_id(mmp2d_db_path):
    """
    Builds the rule_environment id --> rule_id dictionary from the id and
    rule_id columns of the rule_environment table in the supplied MMP 2D
    SQLite database.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: rule environment id --> rule id dictionary
    :rtype: dict{int: int}
    """
    return select_dictionary(mmp2d_db_path,
                             "SELECT id, rule_id FROM rule_environment")
def get_rule_env_stats(mmp2d_db_path):
    """
    Builds the rule_environment id --> [min, max, avg, std, count]
    dictionary from the rule_environment_statistics table. A statistic that
    is NULL in the database (e.g., std when count is 1) appears as None in
    the list.

    If several rows share the same rule_environment_id, the statistics of
    the last such row read are the ones retained.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: rule environment id --> statistics dictionary
    :rtype: dict{int: [float, float, float, float, int]}
    """
    select_stmt = ("SELECT rule_environment_id, min, max, avg, std, count "
                   "FROM rule_environment_statistics")
    stats_by_env_id = {}
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        # First column is the key; the remaining five form the value list.
        for env_id, *stats in db.connection.cursor().execute(select_stmt):
            stats_by_env_id[env_id] = stats
    return stats_by_env_id
def get_rule_id_rule_smiles(mmp2d_db_path):
    """
    Builds the rule_id --> [from_smiles, to_smiles] dictionary by resolving
    the from_smiles_id and to_smiles_id columns of the rule table against
    the rule_smiles table of the supplied MMP 2D SQLite database.

    A rule is left out of the result if either of its SMILES ids has no
    entry in the rule_smiles table.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :return: rule_id --> [from_smiles, to_smiles] dictionary
    :rtype: dict{int: [str, str]}
    """
    smiles_by_id = select_dictionary(mmp2d_db_path,
                                     "SELECT id, smiles FROM rule_smiles")

    rule_query = "SELECT id, from_smiles_id, to_smiles_id FROM rule"
    smiles_pair_by_rule = {}
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        rows = db.connection.cursor().execute(rule_query)
        for rule_id, from_id, to_id in rows:
            # Skip rules whose SMILES cannot be resolved.
            if from_id in smiles_by_id and to_id in smiles_by_id:
                smiles_pair_by_rule[rule_id] = [
                    smiles_by_id[from_id], smiles_by_id[to_id]
                ]
    return smiles_pair_by_rule
def get_transformations(compound_pair, pair_to_rule_env_ids, env_id_to_rule_id,
                        rule_id_to_rule_smiles, env_id_to_stats):
    """
    Given a pair of compound ids and various dictionaries created from the
    database tables, this function returns a list of dictionaries that hold
    data for one or more MMP transformations that relate the first compound
    to the second compound. Each dictionary in the returned list contains the
    following key, value pairs:

    Key                      Value
    ---                      -----
    TRANS_KEYS.FROM_SMILES   Rule SMILES on the side of the first compound (str)
    TRANS_KEYS.TO_SMILES     Rule SMILES on the side of the second compound (str)
    TRANS_KEYS.MIN           The min statistic for the transformation (float)
    TRANS_KEYS.MAX           The max statistic for the transformation (float)
    TRANS_KEYS.AVG           The avg statistic for the transformation (float)
    TRANS_KEYS.STD           The std statistic for the transformation (float)
    TRANS_KEYS.COUNT         The count statistic for the transformation (int)

    A statistic that is None in env_id_to_stats is passed through as None.
    Within one orientation of the pair, a given rule_id is reported only
    once, using the first rule environment id that leads to it.

    :param compound_pair: A pair of compound ids from the compound table. The
        input order and reverse order of the ids are considered, with
        reversal of from_smiles and to_smiles and negation of min, max and
        avg (and exchange of min and max) in the latter case.
    :type compound_pair: (int, int)

    :param pair_to_rule_env_ids: Dictionary that maps compound id pairs to
        rule environment ids. See get_pair_rule_env_ids.
    :type pair_to_rule_env_ids: dict{(int, int): list[int]}

    :param env_id_to_rule_id: Dictionary that maps rule_environment id to
        rule_id. See get_rule_env_rule_id.
    :type env_id_to_rule_id: dict{int: int}

    :param rule_id_to_rule_smiles: Dictionary that maps rule_id to the
        (from_smiles, to_smiles) pair that defines the transformation. See
        get_rule_id_rule_smiles.
    :type rule_id_to_rule_smiles: dict{int: (str, str)}

    :param env_id_to_stats: Dictionary that maps rule_environment id to the
        tuple (min, max, avg, std, count). See get_rule_env_stats.
    :type env_id_to_stats: dict{int: (float, float, float, float, int)}

    :return: List of transformation dictionaries
    :rtype: list[dict{str: str/str/float/float/float/float/int}]

    :raises: KeyError if no transformation is produced, i.e. neither the
        input compound_pair nor its reverse yields any rule environment ids
        from pair_to_rule_env_ids
    """
    reverse_pair = (compound_pair[1], compound_pair[0])

    # One entry per orientation: the pair to look up, whether it is the
    # reversed orientation, the positions of the from/to SMILES within the
    # rule's SMILES pair, and the signs applied to
    # (min, max, avg, std, count).
    orientations = (
        (compound_pair, False, 0, 1, [1, 1, 1, 1, 1]),
        (reverse_pair, True, 1, 0, [-1, -1, -1, 1, 1]),
    )

    transformations = []
    for pair, is_reversed, from_pos, to_pos, signs in orientations:
        if pair not in pair_to_rule_env_ids:
            continue

        # Don't consider the same rule_id more than once for this pair.
        seen_rule_ids = set()
        for env_id in pair_to_rule_env_ids[pair]:
            rule_id = env_id_to_rule_id[env_id]
            if rule_id in seen_rule_ids:
                continue
            seen_rule_ids.add(rule_id)

            smiles_pair = rule_id_to_rule_smiles[rule_id]
            row = {
                TRANS_KEYS.FROM_SMILES: smiles_pair[from_pos],
                TRANS_KEYS.TO_SMILES: smiles_pair[to_pos],
            }

            stats = env_id_to_stats[env_id]
            for key, value, sign in zip(ENV_STATS, stats, signs):
                row[key] = value if value is None else value * sign

            if is_reversed:
                # Negating the values flips the roles of min and max, so
                # exchange them.
                row[TRANS_KEYS.MIN], row[TRANS_KEYS.MAX] = (
                    row[TRANS_KEYS.MAX], row[TRANS_KEYS.MIN])

            transformations.append(row)

    if not transformations:
        mesg = (f"Neither {compound_pair} nor {reverse_pair} found in "
                "pair_to_rule_env_ids")
        raise KeyError(mesg)
    return transformations
def select_dictionary(mmp2d_db_path, select_stmt):
    """
    Runs a two-column SELECT statement on the supplied MMP 2D SQLite
    database and returns the rows as a dictionary keyed on the first column.
    If a key occurs in more than one row, the value of the last such row is
    kept.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :param select_stmt: SELECT statement containing two fields
    :type select_stmt: str

    :return: field1 --> field2 dictionary
    :rtype: dict{type(field1): type(field2)}
    """
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        rows = db.connection.cursor().execute(select_stmt)
        # Consume the rows while the connection is still open.
        return dict(rows)
def table_exists(mmp2d_db_path, table_name):
    """
    Reports whether a table with the given name is listed in sqlite_master
    for the supplied MMP 2D SQLite database. The comparison with the stored
    table names is an exact string match.

    :param mmp2d_db_path: Path to MMP 2D database
    :type mmp2d_db_path: str

    :param table_name: Table whose existence is sought
    :type table_name: str

    :return: Whether the table exists
    :rtype: bool
    """
    select = "SELECT name FROM sqlite_master WHERE type='table'"
    with MMPDatabaseConnection(mmp2d_db_path) as db:
        table_rows = db.connection.cursor().execute(select).fetchall()
        # Each row is a 1-tuple holding a table name.
        return (table_name,) in table_rows