Source code for schrodinger.application.fpsim

"""
This module provides an API for performing GPU-based fingerprint similarity
queries against compound databases on FPsim servers. Two variants of the API are
provided: a query function and the FPsimSearcher class.

The module-level query function can be used to query either all or a specific
subset of the public databses available on a single FPsim server.

The FPsimSearcher class allows multiple servers to be specified, each with
either all public databases or a specific subset. The entire collection of
databases may then be queried in a single call.

The server url should be the base url without the endpoint extension (e.g.
http://fpsim.foo.com/, *not* http://fpsim.foo.com/similarity_search)

"""
import collections
import functools
import urllib
from typing import Union

import pandas
import requests

from schrodinger.infra import licensing

DEFAULT = 'FPSIM_DEFAULT'
DEFAULT_MAX_RESULTS = 100
DEFAULT_SIMILARITY_CUTOFF = 0.5

CORP_ID_SPLITTER = ';:;'
SEARCH_ERROR_STRING = 'SERVER_ERROR_ON_SEARCH'

#===============================================================================
# Functional API
#===============================================================================


[docs]def query(smiles, url=DEFAULT, max_results=DEFAULT_MAX_RESULTS, similarity_cutoff=DEFAULT_SIMILARITY_CUTOFF, dbnames=DEFAULT): """ Perform an FPSim query for a given smiles on a specific server. :param smiles: the query smiles to search against :param url: the base FPsim server URL :param max_results: max number of matches to return :param similarity_cutoff: minimum similarity cutoff for matches (0 - 1.0) :param dbnames: names of databases to search. Default: all public databases :return: a pandas.DataFrame of similar smiles with corresponding corporate ids and similarity scores. """ if dbnames is DEFAULT: dbnames = get_public_databases(url) dbkeys = ['' for _ in dbnames] similarity_cutoff = _convert_cutoff_to_fraction(similarity_cutoff) data = { 'smiles': smiles, 'return_count': str(max_results), 'similarity_cutoff': str(similarity_cutoff), 'dbnames': ','.join(dbnames), 'dbkeys': ','.join(dbkeys) } response = _raw_query(url, data) if not response.ok: raise ServerError(f'Bad server response: {url}') json_results = response.json() if json_results == 'Server error': raise QueryError(f'Problem with query: {data}') if (len(json_results) == 1 and json_results[0][0] == SEARCH_ERROR_STRING): raise SearchError(f'Problem with search: {smiles}') return _json_to_pandas(json_results)
[docs]@functools.lru_cache() def get_public_databases(url=DEFAULT): """ Retrieves a list of database names for the public databases on an FPsim server. :param url: the base FPsim server url. By default this will be the Schrodinger FPsim server. :return: a list of database names, which may be used as the dbnames parameter to a query. :raises requests.exception.ConnectionTimeout: when the get request, which should be instantaneous (up to network delays), takes longer than 3s. """ if url is DEFAULT: # Also performs license check url = licensing.gpusim_current_url( licensing.SimilarityEndpoint.DATABASES) timeout = None else: licensing.licenseExists(licensing.GPUSIMILARITY) url = urllib.parse.urljoin(url, 'dbnames') timeout = 3 response = requests.get(url, {}, timeout=timeout) if not response.ok: raise ServerError(f"Can't fetch db list from {url}") dbnames = response.json() for idx, name in enumerate(dbnames): if name.endswith('.fsim'): dbnames[idx] = name.rstrip('.fsim') return dbnames
#=============================================================================== # Searcher API #===============================================================================
[docs]class FPsimSearcher: """ A search context for FPsim queries. Specify a collection of data sources on the object to perform multiple queries on the same servers and databases. """
[docs] def __init__(self, url=DEFAULT, dbnames=DEFAULT): self._server_dbs = collections.defaultdict(set) if url is not None: self.addSource(url, dbnames)
[docs] def addSource(self, url=DEFAULT, dbnames=DEFAULT): """ Add databases to the searcher specified by URL and database name(s). :param url: the base FPsim server url. By default this will be the Schrodinger FPsim server. :param dbnames: the database name to add for the specified server. This may be a single name or a list of names. """ if dbnames is DEFAULT: dbnames = get_public_databases(url) if isinstance(dbnames, str): dbnames = set(dbnames) if url: self._server_dbs[url].update(dbnames)
[docs] def getSources(self): """ Gets all the sources that have been added to this searcher. :return: a dictionary of {url:dbnames}. """ sources = {} for url, dbnames in self._server_dbs.items(): if url is DEFAULT: url = _get_default_url() sources[url] = dbnames return sources
[docs] def query(self, smiles, max_results=DEFAULT_MAX_RESULTS, similarity_cutoff=DEFAULT_SIMILARITY_CUTOFF): """ Perform a query on all the sources in this searcher. Each server will be queried in the order it was added via addSource until max_results results have been found. The results from each server are simply concatenated, with no de-duplication of results. See module function query() for parameter docs. """ results = _json_to_pandas([]) for url, dbnames in self._server_dbs.items(): df = query(smiles, url, max_results=max_results, similarity_cutoff=similarity_cutoff, dbnames=dbnames) results = pandas.concat([results, df]) max_results = max_results - len(df) if max_results < 1: break return results
#=============================================================================== # Exception classes #===============================================================================
[docs]class ServerError(RuntimeError): pass
[docs]class QueryError(RuntimeError): pass
[docs]class SearchError(RuntimeError): pass
#=============================================================================== # Utility functions #=============================================================================== def _json_to_pandas(json_results): df = pandas.DataFrame(json_results, columns=['corp_ids', 'smiles', 'similarity']) df['corp_ids'] = df['corp_ids'].apply(_parse_corp_id_str) return df def _parse_corp_id_str(corp_id_str): return corp_id_str.split(CORP_ID_SPLITTER) def _get_default_url(): return licensing.gpusim_current_url(licensing.SimilarityEndpoint.FASTSIM) def _raw_query(url, data): if url is DEFAULT: url = _get_default_url() else: url = urllib.parse.urljoin(url, 'similarity_search_json') return requests.post(url, data) def _convert_cutoff_to_fraction(cutoff: Union[int, float]) -> float: """ Return a similarity cutoff between 0 and 1. Converts a percentage cutoff to a fraction. A fractional cutoff is returned untouched if already in the range [0.0, 1.0]. This function is required in order to always pass a fractional similarity cutoff argument into the FPSim search while preserving compatibility with the previous convention of accepting a percentage. :param cutoff: minimum similarity cutoff for matches. Assumed to be a number greater than or equal to 0. :return: A similarity cutoff between 0 and 1. """ # Account for floating point noise epsilon = 1.e-15 if 0 <= cutoff <= 1.0 + epsilon: return min(cutoff, 1.0) return min(cutoff, 100) / 100