# Source code for schrodinger.application.canvas.similarity

"""

Support for Canvas fingerprint similarity operations.

There are classes to perform similarity calculations and to support command
line interfaces for similarity options.

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Quentin McDonald

from collections import OrderedDict
from textwrap import dedent

from schrodinger.infra import canvas


class CanvasSimilarityNotImplemented(Exception):
    """
    Raised when a CanvasSimilarity method has no implementation yet.
    """

    def __init__(self, *args):
        """
        Forward all positional arguments, unchanged, to Exception.
        """
        super(CanvasSimilarityNotImplemented, self).__init__(*args)
############# Canvas classes begin here ##################################
class CanvasFingerprintSimilarity(object):
    """
    A class which encapsulates the Canvas fingerprint similarity tools.
    This includes recording and implementing the available similarity
    metrics.

    Currently the metrics are implemented at the Python level as this
    demonstrates how the fingerprint manipulations are performed; however,
    ultimately these will be replaced with wrappers to the underlying
    Canvaslibs tools which should be more efficient.

    NOTE(review): every sim* method in this class simply forwards to a
    sim*/dist* method of the first fingerprint argument, so the paragraph
    above may be out of date - confirm.
    """

    # Full (display) names of every metric this class knows about.
    SIMILARITY_METRICS = [
        "Tanimoto", "Modified Tanimoto", "Hamming", "Soergel",
        "McConnaughey", "Dice", "Cosine", "Simpson", "Petke", "Kulczynski",
        "Euclidean", "Tversky", "Buser", "Variance", "Size", "Shape",
        "Pattern Difference", "Hamann", "Matching", "Pearson",
        "Rogers Tanimoto", "Yule", "Dixon", "MinMax"
    ]

    def __init__(self, logger):
        """
        Initialize the similarity class.

        Builds the short-name mapping and the two per-metric lookup tables,
        sets the Tversky alpha and beta to 0.5 and selects "Tanimoto" as the
        current metric.

        :param logger: object whose debug() method receives diagnostic
            messages (one is emitted while the default metric is selected).
        """
        self._logger = logger

        # NOTE: this sorts the class-level list in place, so the ordering is
        # shared by the class and all of its instances. Sorting is
        # idempotent, so repeated construction is harmless.
        self.SIMILARITY_METRICS.sort()

        # Create a mapping between "short" names (as might be used
        # in a command line application) and the full metric names.
        self.SHORT_SIMILARITY_METRICS = []
        self.SHORT_TO_LONG_NAMES = {}
        for metric in self.SIMILARITY_METRICS:
            # Lower-case and convert spaces to underscores
            short = metric.lower().replace(" ", "_")
            self.SHORT_SIMILARITY_METRICS.append(short)
            self.SHORT_TO_LONG_NAMES[short] = metric

        # Initialize the Tversky alpha and beta
        self._alpha = 0.5
        self._beta = 0.5

        # Create a dispatch table which associates each metric name
        # with a callable method to implement it:
        self._metric_funcs = OrderedDict([
            ("Tanimoto", self.simTanimoto),
            ("Tversky", self.simTversky),
            ("Soergel", self.simSoergel),
            ("McConnaughey", self.simMcConnaughey),
            ("Dice", self.simDice),
            ("Cosine", self.simCosine),
            ("Simpson", self.simSimpson),
            ("Petke", self.simPetke),
            ("Kulczynski", self.simKulczynski),
            ("Buser", self.simBuser),
            ("Hamann", self.simHamann),
            ("Matching", self.simMatching),
            ("Pearson", self.simPearson),
            ("Rogers Tanimoto", self.simRogersTanimoto),
            ("Yule", self.simYule),
            ("Euclidean", self.simEuclidean),
            ("Hamming", self.simHamming),
            ("Modified Tanimoto", self.simModifiedTanimoto),
            ("Pattern Difference", self.simPatternDifference),
            ("Shape", self.simShape),
            ("Size", self.simSize),
            ("Variance", self.simVariance),
            ("Dixon", self.simDixon),
            ("MinMax", self.simMinMax),
        ])

        # Default is Tanimoto
        self.setMetric("Tanimoto")

        # Create a table which associates metric names with
        # enums used in pairwise distance matrix generation
        self._metric_style = OrderedDict([
            ("Tanimoto", canvas.ChmMetricStyle_tanimoto),
            ("Modified Tanimoto", canvas.ChmMetricStyle_modifiedTanimoto),
            ("Hamming", canvas.ChmMetricStyle_hamming),
            ("Soergel", canvas.ChmMetricStyle_soergel),
            ("McConnaughey", canvas.ChmMetricStyle_mcConnaughey),
            ("Dice", canvas.ChmMetricStyle_dice),
            ("Cosine", canvas.ChmMetricStyle_cosine),
            ("Simpson", canvas.ChmMetricStyle_simpson),
            ("Petke", canvas.ChmMetricStyle_petke),
            ("Kulczynski", canvas.ChmMetricStyle_kulczynski),
            ("Euclidean", canvas.ChmMetricStyle_euclidean),
            ("Tversky", canvas.ChmMetricStyle_tversky),
            ("Buser", canvas.ChmMetricStyle_buser),
            ("Variance", canvas.ChmMetricStyle_variance),
            ("Size", canvas.ChmMetricStyle_size),
            ("Shape", canvas.ChmMetricStyle_shape),
            ("Pattern Difference", canvas.ChmMetricStyle_patternDifference),
            ("Hamann", canvas.ChmMetricStyle_hamann),
            ("Matching", canvas.ChmMetricStyle_matching),
            ("Pearson", canvas.ChmMetricStyle_pearson),
            ("Rogers Tanimoto", canvas.ChmMetricStyle_rogersTanimoto),
            ("Yule", canvas.ChmMetricStyle_yule),
            ("Dixon", canvas.ChmMetricStyle_dixon),
            ("MinMax", canvas.ChmMetricStyle_cminmax),
        ])

    def debug(self, output):
        """
        Wrapper for debug logging, just to simplify logging.

        :param output: message passed on to the logger's debug() method.
        """
        self._logger.debug(output)

    def getDescription(self):
        """
        Returns a string representing a summary of the current similarity
        settings (currently just the name of the current metric).
        """
        return "%s" % self._current_metric

    def setMetric(self, metric_name):
        """
        Set the current metric based on the metric name.

        :param metric_name: either a full metric name (an entry of
            SIMILARITY_METRICS) or the corresponding short name (lower case
            with underscores instead of spaces).
        :raises ValueError: if the name matches no known metric.
        """
        # Convert to the long name if necessary:
        name = self.SHORT_TO_LONG_NAMES.get(metric_name, metric_name)
        if name not in self.SIMILARITY_METRICS:
            # ValueError is still caught by callers using "except Exception"
            raise ValueError(
                "Unknown similarity metric name: %s" % metric_name)

        self.debug("FPSim - setting metric to %s" % name)
        self._current_metric = name
        # None here means the metric is known but has no implementation;
        # calculateSimilarity() reports that when it is called.
        self._current_metric_func = self._metric_funcs.get(name, None)

    def getMetric(self):
        """
        Returns the (full) name of the currently set metric.
        """
        return self._current_metric

    def calculateSimilarity(self, fp1, fp2):
        """
        Calculate the similarity between the two fingerprints and return the
        value. The similarity is calculated using the similarity method which
        is current for this object (as set by setMetric()).

        :param fp1: first fingerprint.
        :param fp2: second fingerprint.
        :raises CanvasSimilarityNotImplemented: if the current metric has no
            implementation registered.
        """
        if not self._current_metric_func:
            raise CanvasSimilarityNotImplemented(
                "No implementation currently available for %s" %
                (self._current_metric))
        return self._current_metric_func(fp1, fp2)

    def setAlpha(self, alpha):
        """
        Set the value of Alpha as used in the Tversky similarity.
        """
        self._alpha = alpha

    def setBeta(self, beta):
        """
        Set the value of Beta as used in the Tversky similarity.
        """
        self._beta = beta

    def getAlpha(self):
        """
        Get the value of Alpha as used in the Tversky similarity.
        """
        return self._alpha

    def getBeta(self):
        """
        Get the value of Beta as used in the Tversky similarity.
        """
        return self._beta

    def getMetricStyle(self):
        """
        Return a value corresponding to the current metric style. This is
        used in difference matrix construction as part of clustering.
        """
        return self._metric_style[self._current_metric]

    def _getABC(self, fp1, fp2):
        """
        Most similarity methods use three quantities calculated from the
        input fingerprints. This private method calculates these so as to
        avoid duplicated code everywhere.

        :return: tuple of floats (a, b, c) where a is fp1.count(), b is
            fp2.count() and c is fp1.countCommonOn(fp2).
        """
        a = float(fp1.count())
        b = float(fp2.count())
        # Converted as well, so that all three values really are floats.
        c = float(fp1.countCommonOn(fp2))
        return (a, b, c)

    # Similarity implementations follow. Each one forwards to the method of
    # the same metric on the first fingerprint. Several forward to dist*
    # rather than sim* methods (Hamming, Pattern Difference, Shape, Size,
    # Variance, Euclidean, Soergel and Dixon).
    # NOTE(review): presumably those return distances rather than
    # similarities - confirm against the Canvas fingerprint API.

    def simHamming(self, fp1, fp2):
        """Return fp1.distHamming(fp2)."""
        return fp1.distHamming(fp2)

    def simModifiedTanimoto(self, fp1, fp2):
        """Return fp1.simModifiedTanimoto(fp2)."""
        return fp1.simModifiedTanimoto(fp2)

    def simPatternDifference(self, fp1, fp2):
        """Return fp1.distPatternDifference(fp2)."""
        return fp1.distPatternDifference(fp2)

    def simShape(self, fp1, fp2):
        """Return fp1.distShape(fp2)."""
        return fp1.distShape(fp2)

    def simSize(self, fp1, fp2):
        """Return fp1.distSize(fp2)."""
        return fp1.distSize(fp2)

    def simVariance(self, fp1, fp2):
        """Return fp1.distVariance(fp2)."""
        return fp1.distVariance(fp2)

    def simEuclidean(self, fp1, fp2):
        """Return fp1.distEuclidean(fp2)."""
        return fp1.distEuclidean(fp2)

    def simTanimoto(self, fp1, fp2):
        """Return fp1.simTanimoto(fp2)."""
        return fp1.simTanimoto(fp2)

    def simTversky(self, fp1, fp2):
        """Return fp1.simTversky(fp2, alpha, beta) using the stored values."""
        return fp1.simTversky(fp2, self._alpha, self._beta)

    def simSoergel(self, fp1, fp2):
        """Return fp1.distSoergel(fp2)."""
        return fp1.distSoergel(fp2)

    def simMcConnaughey(self, fp1, fp2):
        """Return fp1.simMcConnaughey(fp2)."""
        return fp1.simMcConnaughey(fp2)

    def simDice(self, fp1, fp2):
        """Return fp1.simDice(fp2)."""
        return fp1.simDice(fp2)

    def simCosine(self, fp1, fp2):
        """Return fp1.simCosine(fp2)."""
        return fp1.simCosine(fp2)

    def simSimpson(self, fp1, fp2):
        """Return fp1.simSimpson(fp2)."""
        return fp1.simSimpson(fp2)

    def simPetke(self, fp1, fp2):
        """Return fp1.simPetke(fp2)."""
        return fp1.simPetke(fp2)

    def simKulczynski(self, fp1, fp2):
        """Return fp1.simKulczynski(fp2)."""
        return fp1.simKulczynski(fp2)

    def simBuser(self, fp1, fp2):
        """Return fp1.simBuser(fp2)."""
        return fp1.simBuser(fp2)

    def simHamann(self, fp1, fp2):
        """Return fp1.simHamann(fp2)."""
        return fp1.simHamann(fp2)

    def simMatching(self, fp1, fp2):
        """Return fp1.simMatching(fp2)."""
        return fp1.simMatching(fp2)

    def simPearson(self, fp1, fp2):
        """Return fp1.simPearson(fp2)."""
        return fp1.simPearson(fp2)

    def simRogersTanimoto(self, fp1, fp2):
        """Return fp1.simRogersTanimoto(fp2)."""
        return fp1.simRogersTanimoto(fp2)

    def simYule(self, fp1, fp2):
        """Return fp1.simYule(fp2)."""
        return fp1.simYule(fp2)

    def simDixon(self, fp1, fp2):
        """Return fp1.distDixon(fp2)."""
        return fp1.distDixon(fp2)

    def simMinMax(self, fp1, fp2):
        """Return fp1.simMinMax(fp2)."""
        return fp1.simMinMax(fp2)
############# Command line specific classes start here:
class CanvasFingerprintSimilarityCLI(CanvasFingerprintSimilarity):
    """
    A subclass of the CanvasFingerprintSimilarity class which supports
    operations from the command line. In particular the parsing and applying
    of options and the printing of a description of the available similarity
    metrics.
    """

    def __init__(self, logger):
        """
        Initialize the base similarity class with the given logger.
        """
        super(CanvasFingerprintSimilarityCLI, self).__init__(logger)

    def addOptions(self, parser):
        """
        Add options for similarity type, alpha and beta.

        :param parser: parser object providing an argparse-style
            add_argument() method; the options -sim_type, -sim_alpha and
            -sim_beta are registered on it.
        """
        parser.add_argument("-sim_type",
                            action="store",
                            type=str,
                            default="tanimoto",
                            choices=self.SHORT_SIMILARITY_METRICS,
                            metavar="type",
                            help="Similarity metric to be used")

        # type=float makes a malformed number fail while the command line is
        # parsed instead of later, inside parseOptions().
        parser.add_argument("-sim_alpha",
                            action="store",
                            type=float,
                            metavar="<alpha>",
                            default=0.5,
                            help="Alpha for Tversky similarity (default = 0.5)")

        parser.add_argument("-sim_beta",
                            action="store",
                            type=float,
                            metavar="<beta>",
                            default=0.5,
                            help="Beta for Tversky similarity (default = 0.5)")

    def parseOptions(self, options):
        """
        Examine the options and set the internal state to reflect them.

        :param options: object with sim_type, sim_alpha and sim_beta
            attributes, as produced by a parser set up with addOptions().
        """
        self.setMetric(options.sim_type)
        # float() is kept so that option objects carrying string values
        # continue to work.
        self.setAlpha(float(options.sim_alpha))
        self.setBeta(float(options.sim_beta))

    def getSimilarityMetricDescription(self):
        """
        Return a string which contains a description of the available
        similarity metrics: a header line followed by one line per metric
        giving the full name and, in parentheses, the short name.
        """
        pieces = [""" Available similarity metrics are : \n"""]
        pieces.extend(
            " %s (%s)\n" % (self.SHORT_TO_LONG_NAMES[short], short)
            for short in self.SHORT_SIMILARITY_METRICS)
        # dedent strips the indentation shared by all of the lines
        return dedent("".join(pieces))