Source code for schrodinger.trajectory.prody.pca

# -*- coding: utf-8 -*-

#
# ProDy: A Python Package for Protein Dynamics Analysis
#
# Copyright (C) 2010-2014 University of Pittsburgh
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
"""This module defines classes for principal component analysis (PCA) and
essential dynamics analysis (EDA) calculations."""

import time
from past.utils import old_div

import numpy as np

from .ensemble import Ensemble
from .ensemble import importLA
from .nma import NMA

__all__ = ['PCA', 'EDA']


[docs]class PCA(NMA): """A class for Principal Component Analysis (PCA) of conformational ensembles. See examples in :ref: `pca`."""
[docs] def __init__(self, name='Unknown'): NMA.__init__(self, name)
[docs] def setCovariance(self, covariance): """Set covariance matrix.""" if not isinstance(covariance, np.ndarray): raise TypeError('covariance must be an ndarray') elif not (covariance.ndim == 2 and covariance.shape[0] == covariance.shape[1]): raise TypeError('covariance must be square matrix') self._reset() self._cov = covariance self._dof = covariance.shape[0] self._n_atoms = old_div(self._dof, 3) self._trace = self._cov.trace()
[docs] def buildCovariance(self, coordsets, **kwargs): """Build a covariance matrix for *coordsets* using mean coordinates as the reference. *coordsets* argument may be one of the following: * :class: `.Ensemble` * :class: `numpy.ndarray` with shape ``(n_csets, n_atoms, 3)`` For ensemble and trajectory objects, ``update_coords=True`` argument can be used to set the mean coordinates as the coordinates of the object. When *coordsets* is a trajectory object, such as :class: `.DCDFile`, covariance will be built by superposing frames onto the reference coordinate set (see :meth: `.Frame.superpose`). If frames are already aligned, use ``aligned=True`` argument to skip this step. """ mean = None weights = None ensemble = None if isinstance(coordsets, np.ndarray): if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or coordsets.dtype not in (np.float32, float)): raise ValueError('coordsets is not a valid coordinate array') elif isinstance(coordsets, Ensemble): ensemble = coordsets coordsets = coordsets._getCoordsets() update_coords = bool(kwargs.get('update_coords', False)) n_confs = coordsets.shape[0] if n_confs < 3: raise ValueError('coordsets must have more than 3 coordinate ' 'sets') n_atoms = coordsets.shape[1] if n_atoms < 3: raise ValueError('coordsets must have more than 3 atoms') dof = n_atoms * 3 print('Covariance is calculated using coordinate sets.') s = (n_confs, dof) if weights is None: if coordsets.dtype == float: self._cov = np.cov(coordsets.reshape((n_confs, dof)).T, bias=1) else: cov = np.zeros((dof, dof)) coordsets = coordsets.reshape((n_confs, dof)) mean = coordsets.mean(0) # print('Building covariance', n_confs, '_prody_pca') for i, coords in enumerate(coordsets.reshape(s)): deviations = coords - mean cov += np.outer(deviations, deviations) cov /= n_confs self._cov = cov else: # PDB ensemble case mean = np.zeros((n_atoms, 3)) for i, coords in enumerate(coordsets): mean += coords * weights[i] mean /= weights.sum(0) d_xyz = ((coordsets - mean) * weights).reshape(s) divide_by = weights.astype(float).repeat(3, axis=2).reshape(s) self._cov = old_div(np.dot(d_xyz.T, d_xyz), np.dot(divide_by.T, divide_by)) if update_coords and ensemble is not None: if mean is None: mean = coordsets.mean(0) ensemble.setCoords(mean) self._trace = self._cov.trace() self._dof = dof self._n_atoms = n_atoms
[docs] def calcModes(self, n_modes=20, turbo=True): """Calculate principal (or essential) modes. This method uses :func: `scipy.linalg.eigh`, or :func: `numpy.linalg.eigh`, function to diagonalize the covariance matrix. :arg n_modes: number of non-zero eigenvalues/vectors to calculate, default is 20, for **None** all modes will be calculated :type n_modes: int :arg turbo: when available, use a memory intensive but faster way to calculate modes, default is **True** :type turbo: bool""" linalg = importLA() if self._cov is None: raise ValueError('covariance matrix is not built or set') start = time.time() dof = self._dof if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = dof else: n_modes = int(n_modes) if n_modes >= self._dof: eigvals = None n_modes = dof else: eigvals = (dof - n_modes, dof - 1) values, vectors = linalg.eigh(self._cov, turbo=turbo, eigvals=eigvals) else: if n_modes is not None: print('Scipy is not found, all modes are calculated.') values, vectors = linalg.eigh(self._cov) # Order by descending SV revert = list(range(len(values) - 1, -1, -1)) values = values[revert] vectors = vectors[:, revert] which = values > 1e-8 self._eigvals = values[which] self._array = vectors[:, which] self._vars = self._eigvals self._n_modes = len(self._eigvals)
[docs] def performSVD(self, coordsets): """ Calculate principal modes using singular value decomposition (SVD). *coordsets* argument may be a :class: `.Ensemble`, or :class: `numpy.ndarray` instance. If *coordsets* is a numpy array, its shape must be ``(n_csets, n_atoms, 3)``. Note that coordinate sets must be aligned prior to SVD calculations. This is a considerably faster way of performing PCA calculations compared to eigenvalue decomposition of covariance matrix, but is an approximate method when heterogeneous datasets are analyzed. Covariance method should be preferred over this one for analysis of ensembles with missing atomic data. See :ref: `pca-xray-calculations` example for comparison of results from SVD and covariance methods. """ linalg = importLA() start = time.time() if not isinstance(coordsets, (Ensemble, np.ndarray)): raise TypeError('coordsets must be an Ensemble, Numpy ' 'array instance') if isinstance(coordsets, np.ndarray): if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or coordsets.dtype not in (np.float32, float)): raise ValueError('coordsets is not a valid coordinate array') deviations = coordsets - coordsets.mean(0) else: if isinstance(coordsets, Ensemble): deviations = coordsets.getDeviations() n_confs = deviations.shape[0] if n_confs < 3: raise ValueError('coordsets must have more than 3 coordinate sets') n_atoms = deviations.shape[1] if n_atoms < 3: raise ValueError('coordsets must have more than 3 atoms') dof = n_atoms * 3 deviations = deviations.reshape((n_confs, dof)).T vectors, values, self._temp = linalg.svd(deviations, full_matrices=False) values = old_div((values**2), n_confs) self._dof = dof self._n_atoms = n_atoms which = values > 1e-18 self._eigvals = values[which] self._array = vectors[:, which] self._vars = self._eigvals self._trace = self._vars.sum() self._n_modes = len(self._eigvals)
[docs] def addEigenpair(self, eigenvector, eigenvalue=None): """Add eigen *vector* and eigen *value* pair(s) to the instance. If eigen *value* is omitted, it will be set to 1. Eigenvalues are set as variances.""" NMA.addEigenpair(self, eigenvector, eigenvalue) self._vars = self._eigvals
[docs] def setEigens(self, vectors, values=None): """Set eigen *vectors* and eigen *values*. If eigen *values* are omitted, they will be set to 1. Eigenvalues are set as variances.""" NMA.setEigens(self, vectors, values) self._vars = self._eigvals
[docs]class EDA(PCA): """ A class for Essential Dynamics Analysis (EDA) [AA93]_. See examples in :ref: `eda`. .. [AA93] Amadei A, Linssen AB, Berendsen HJ. Essential dynamics of proteins. *Proteins* **1993** 17(4):412-25."""