Source code for schrodinger.trajectory.prody.pca

# -*- coding: utf-8 -*-

#
# ProDy: A Python Package for Protein Dynamics Analysis
#
# Copyright (C) 2010-2014 University of Pittsburgh
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
"""This module defines classes for principal component analysis (PCA) and
essential dynamics analysis (EDA) calculations."""

import time
from past.utils import old_div

import numpy as np

from .ensemble import Ensemble
from .ensemble import importLA
from .nma import NMA

__all__ = ['PCA', 'EDA']


[docs]class PCA(NMA):
    """A class for Principal Component Analysis (PCA) of conformational
    ensembles. See examples in :ref: `pca`."""

[docs]    def __init__(self, name='Unknown'):

        NMA.__init__(self, name)

[docs]    def setCovariance(self, covariance):
        """Set covariance matrix."""

        if not isinstance(covariance, np.ndarray):
            raise TypeError('covariance must be an ndarray')
        elif not (covariance.ndim == 2 and
                  covariance.shape[0] == covariance.shape[1]):
            raise TypeError('covariance must be square matrix')
        self._reset()
        self._cov = covariance
        self._dof = covariance.shape[0]
        self._n_atoms = old_div(self._dof, 3)
        self._trace = self._cov.trace()

[docs]    def buildCovariance(self, coordsets, **kwargs):
        """Build a covariance matrix for *coordsets* using mean coordinates
        as the reference.  *coordsets* argument may be one of the following:

        * :class: `.Ensemble`
        * :class: `numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

        For ensemble and trajectory objects, ``update_coords=True`` argument
        can be used to set the mean coordinates as the coordinates of the
        object.

        When *coordsets* is a trajectory object, such as :class: `.DCDFile`,
        covariance will be built by superposing frames onto the reference
        coordinate set (see :meth: `.Frame.superpose`).  If frames are already
        aligned, use ``aligned=True`` argument to skip this step.
        """
        mean = None
        weights = None
        ensemble = None
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')

        elif isinstance(coordsets, Ensemble):
            ensemble = coordsets
            coordsets = coordsets._getCoordsets()

        update_coords = bool(kwargs.get('update_coords', False))

        n_confs = coordsets.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have more than 3 coordinate '
                             'sets')
        n_atoms = coordsets.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have more than 3 atoms')
        dof = n_atoms * 3
        print('Covariance is calculated using coordinate sets.')
        s = (n_confs, dof)
        if weights is None:
            if coordsets.dtype == float:
                self._cov = np.cov(coordsets.reshape((n_confs, dof)).T, bias=1)
            else:
                cov = np.zeros((dof, dof))
                coordsets = coordsets.reshape((n_confs, dof))
                mean = coordsets.mean(0)
                # print('Building covariance', n_confs, '_prody_pca')
                for i, coords in enumerate(coordsets.reshape(s)):
                    deviations = coords - mean
                    cov += np.outer(deviations, deviations)
                cov /= n_confs
                self._cov = cov
        else:
            # PDB ensemble case
            mean = np.zeros((n_atoms, 3))
            for i, coords in enumerate(coordsets):
                mean += coords * weights[i]
            mean /= weights.sum(0)
            d_xyz = ((coordsets - mean) * weights).reshape(s)
            divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
            self._cov = old_div(np.dot(d_xyz.T, d_xyz),
                                np.dot(divide_by.T, divide_by))
        if update_coords and ensemble is not None:
            if mean is None:
                mean = coordsets.mean(0)
            ensemble.setCoords(mean)

        self._trace = self._cov.trace()
        self._dof = dof
        self._n_atoms = n_atoms

[docs]    def calcModes(self, n_modes=20, turbo=True):
        """Calculate principal (or essential) modes.  This method uses
        :func: `scipy.linalg.eigh`, or :func: `numpy.linalg.eigh`, function
        to diagonalize the covariance matrix.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
            default is 20, for **None** all modes will be calculated
        :type n_modes: int

        :arg turbo: when available, use a memory intensive but faster way to
            calculate modes, default is **True**
        :type turbo: bool"""

        linalg = importLA()
        if self._cov is None:
            raise ValueError('covariance matrix is not built or set')
        start = time.time()
        dof = self._dof
        if linalg.__package__.startswith('scipy'):
            if n_modes is None:
                eigvals = None
                n_modes = dof
            else:
                n_modes = int(n_modes)
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = dof
                else:
                    eigvals = (dof - n_modes, dof - 1)
            values, vectors = linalg.eigh(self._cov,
                                          turbo=turbo,
                                          eigvals=eigvals)
        else:
            if n_modes is not None:
                print('Scipy is not found, all modes are calculated.')
            values, vectors = linalg.eigh(self._cov)
        # Order by descending SV
        revert = list(range(len(values) - 1, -1, -1))
        values = values[revert]
        vectors = vectors[:, revert]
        which = values > 1e-8
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._n_modes = len(self._eigvals)

[docs]    def performSVD(self, coordsets):
        """
        Calculate principal modes using singular value decomposition (SVD).
        *coordsets* argument may be a :class: `.Ensemble`, or :class:
        `numpy.ndarray` instance. If *coordsets* is a numpy array, its shape
        must be ``(n_csets, n_atoms, 3)``. Note that coordinate sets must be
        aligned prior to SVD calculations.

        This is a considerably faster way of performing PCA calculations
        compared to eigenvalue decomposition of covariance matrix, but is
        an approximate method when heterogeneous datasets are analyzed.
        Covariance method should be preferred over this one for analysis of
        ensembles with missing atomic data.  See :ref: `pca-xray-calculations`
        example for comparison of results from SVD and covariance methods.
        """

        linalg = importLA()

        start = time.time()
        if not isinstance(coordsets, (Ensemble, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Numpy '
                            'array instance')
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
            deviations = coordsets - coordsets.mean(0)
        else:
            if isinstance(coordsets, Ensemble):
                deviations = coordsets.getDeviations()

        n_confs = deviations.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have more than 3 coordinate sets')
        n_atoms = deviations.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have more than 3 atoms')

        dof = n_atoms * 3
        deviations = deviations.reshape((n_confs, dof)).T

        vectors, values, self._temp = linalg.svd(deviations,
                                                 full_matrices=False)
        values = old_div((values**2), n_confs)
        self._dof = dof
        self._n_atoms = n_atoms
        which = values > 1e-18
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)

[docs]    def addEigenpair(self, eigenvector, eigenvalue=None):
        """Add eigen *vector* and eigen *value* pair(s) to the instance.
        If eigen *value* is omitted, it will be set to 1.  Eigenvalues
        are set as variances."""

        NMA.addEigenpair(self, eigenvector, eigenvalue)
        self._vars = self._eigvals

[docs]    def setEigens(self, vectors, values=None):
        """Set eigen *vectors* and eigen *values*.  If eigen *values* are
        omitted, they will be set to 1.  Eigenvalues are set as variances."""

        NMA.setEigens(self, vectors, values)
        self._vars = self._eigvals


[docs]class EDA(PCA):
    """
    A class for Essential Dynamics Analysis (EDA) [AA93]_.

    See examples in :ref: `eda`.

    .. [AA93] Amadei A, Linssen AB, Berendsen HJ. Essential dynamics of
       proteins. *Proteins* **1993** 17(4):412-25."""