Source code for secml.ml.features.reduction.c_reducer_pca

"""
.. module:: CPCA
   :synopsis: Principal Component Analysis (PCA)

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from secml.array import CArray
from secml.ml.features.reduction import CReducer
from secml.utils.mixed_utils import check_is_fitted

__all__ = ['CPCA']


[docs]class CPCA(CReducer):
    """Principal Component Analysis (PCA).

    Parameters
    ----------
    preprocess : CPreProcess or str or None, optional
        Features preprocess to be applied to input data.
        Can be a CPreProcess subclass or a string with the type of the
        desired preprocessor. If None, input data is used as is.

    Attributes
    ----------
    class_type : 'pca'

    """
    __class_type = 'pca'

    def __init__(self, n_components=None, preprocess=None):
        """Principal Component Analysis (PCA)

        Apply a linear transformation at data, project it into
        a new spaces where total variance of data is maximized
        (axis are the eigenvector of your matrix). If number of
        components is less than number of patterns is useful for
        project data into a low dimensional space.

        This implementation uses the scipy.linalg implementation
        of the singular value reduction. It only works for
        dense arrays and is not scalable to large dimensional data.

        Parameters
        ----------
        n_components : None or int, optional
            Number of components to keep. If n_components is not set::
                n_components == min(n_samples, n_features)
            If ``0 < n_components < 1``, select the number of components
            such that the amount of variance that needs to be explained
            is greater than the percentage specified by n_components.

        Notes
        -----
        While this implementation transparently works for sparse format
        arrays too, data is converted to dense form internally and thus
        memory consumption can be high.

        Examples
        --------
        >>> from secml.array import CArray
        >>> from secml.ml.features.reduction import CPCA

        >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]])
        >>> CPCA().fit_transform(array)
        CArray(3, 3)(dense: [[-4.078722e+00 -2.478266e+00  1.855417e-17] [-2.727232e+00  2.829603e+00  6.708859e-17] [ 6.805954e+00 -3.513362e-01 -1.757349e-16]])


        """
        self.n_components = n_components
        self._eigenvec = None
        self._eigenval = None
        self._components = None
        self._mean = None
        self._explained_variance = None
        self._explained_variance_ratio = None

        super(CPCA, self).__init__(preprocess=preprocess)

    @property
    def eigenval(self):
        """Eigenvalues estimated from the training data."""
        return self._eigenval

    @property
    def eigenvec(self):
        """Eigenvectors estimated from the training data."""
        return self._eigenvec

    @property
    def components(self):
        """Eigenvectors of inverse training array."""
        return self._components

    @property
    def mean(self):
        """Per-feature empirical mean, estimated from the training data."""
        return self._mean

    @property
    def explained_variance(self):
        """Variance explained by each of the selected components."""
        return self._explained_variance

    @property
    def explained_variance_ratio(self):
        """Percentage of variance explained by each of the selected components.

        If n_components is None, then all components are stored and
        the sum of explained variances is equal to 1.0

        """
        return self._explained_variance_ratio

    def _check_is_fitted(self):
        """Check if the preprocessor is trained (fitted).

        Raises
        ------
        NotFittedError
            If the preprocessor is not fitted.

        """
        check_is_fitted(self, ['components', 'mean'])

    def _fit(self, x, y=None):
        """Fit the PCA using input data.

        Parameters
        ----------
        x : array_like
            Training data, 2-Dim array like object with shape
            (n_patterns, n_features), where each row is a pattern
            of n_features columns.
        y : CArray or None, optional
            Flat array with the label of each pattern.
            Can be None if not required by the preprocessing algorithm.

        Returns
        -------
        CPCA
            Instance of the trained transformer.

        Examples
        --------
        >>> from secml.array import CArray
        >>> from secml.ml.features.reduction import CPCA

        >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]])
        >>> pca = CPCA().fit(array)
        >>> pca.eigenval
        CArray(3,)(dense: [8.390159e+00 3.777816e+00 1.909570e-17])
        >>> pca.eigenvec
        CArray(3, 3)(dense: [[-0.486132 -0.656005  0.57735 ] [-0.325051  0.749005  0.57735 ] [ 0.811183 -0.093     0.57735 ]])
        >>> pca.explained_variance
        CArray(3,)(dense: [3.519739e+01 7.135946e+00 1.823230e-34])
        >>> pca.explained_variance_ratio
        CArray(3,)(dense: [8.314343e-01 1.685657e-01 4.306842e-36])

        """
        data_carray = CArray(x).todense().atleast_2d()
        # Max number of components is the number of patterns available (rows)
        n_samples = data_carray.shape[0]
        n_features = data_carray.shape[1]

        if self.n_components is None:
            self.n_components = min(n_samples, n_features)
        else:
            if self.n_components > n_samples:
                raise ValueError("maximum number of components is {:}".format(n_samples))

        # Centering training data
        self._mean = data_carray.mean(axis=0, keepdims=False)
        data_carray -= self.mean

        # Performing training of PCA (used by KernelPCA too)
        return self._svd_train(data_carray)

    def _svd_train(self, data_carray):
        """Linear PCA training routine, used also by KernelPCA."""

        # Computing SVD reduction
        from numpy import linalg
        from sklearn.utils.extmath import svd_flip
        u, s, v = linalg.svd(data_carray.atleast_2d().tondarray(),
                             full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        u, v = svd_flip(u, v)

        eigenvec = CArray(u)
        eigenval = CArray(s)
        components = CArray(v)

        # Now we sort the eigenvalues/eigenvectors
        idx = (-eigenval).argsort(axis=None)
        eigenval = CArray(eigenval[idx])
        eigenvec = CArray(eigenvec[:, idx]).atleast_2d()
        components = CArray(components[idx, :]).atleast_2d()
        # percentage of variance explained by each component
        explained_variance = (eigenval ** 2) / (data_carray.shape[0] - 1)
        explained_variance_ratio = explained_variance / explained_variance.sum()

        if 0 < self.n_components < 1.0:
            # number of components for which the cumulated explained variance
            # percentage is superior to the desired threshold
            ratio_cumsum = explained_variance_ratio.cumsum()
            self.n_components = CArray(ratio_cumsum < self.n_components).sum() + 1

        # Consider only n_components
        self._eigenval = CArray(eigenval[:self.n_components])
        self._eigenvec = CArray(eigenvec[:, :self.n_components])
        self._components = CArray(components[:self.n_components, :])

        # storing explained variance of n_components only
        self._explained_variance = explained_variance[:self.n_components]
        self._explained_variance_ratio = explained_variance_ratio[:self.n_components]

        return self

    def _forward(self, x):
        """Apply the reduction algorithm on data.

        Parameters
        ----------
        x : array_like
            Array to be transformed. 2-D array object of shape
            (n_patterns, n_features). n_features must be equal to
            n_components parameter set before or during training.

        Returns
        --------
        CArray
            Input data mapped to PCA space.

        Examples
        --------
        >>> from secml.array import CArray
        >>> from secml.ml.features.reduction import CPCA

        >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]])
        >>> pca = CPCA().fit(array)
        >>> pca.transform(CArray.concatenate(array, [4., 2., -6.], axis=0))
        CArray(4, 3)(dense: [[-4.078722e+00 -2.478266e+00  1.855417e-17] [-2.727232e+00  2.829603e+00  6.708859e-17] [ 6.805954e+00 -3.513362e-01 -1.757349e-16] [ 3.209152e+00  1.129680e+00  3.296909e+00]])


        >>> pca.transform([4., 2.])
        Traceback (most recent call last):
            ...
        ValueError: array to transform must have 3 features (columns).

        """
        data_carray = CArray(x).todense().atleast_2d()
        if data_carray.shape[1] != self.mean.size:
            raise ValueError("array to transform must have {:} "
                             "features (columns).".format(self.mean.size))

        out = CArray((data_carray - self.mean).dot(self._components.T))
        return out.atleast_2d() if x.ndim >= 2 else out

    def _inverse_transform(self, x):
        """Map data back to its original space.

        Parameters
        ----------
        x : CArray
            Array to transform back to its original space.

        Returns
        --------
        CArray
            Input array mapped back to its original space.

        Examples
        --------
        >>> from secml.array import CArray
        >>> from secml.ml.features.reduction import CPCA

        >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]])
        >>> pca = CPCA().fit(array)
        >>> array_pca = pca.transform(array)
        >>> pca.inverse_transform(array_pca).round(6)
        CArray(3, 3)(dense: [[ 1. -0.  2.] [ 2.  5. -0.] [-0.  1. -9.]])

        """
        data_carray = CArray(x).atleast_2d()
        if data_carray.shape[1] != self.n_components:
            raise ValueError("array to revert must have {:} "
                             "features (columns).".format(self.n_components))

        out = CArray(data_carray.dot(self._components) + self.mean)

        return out.atleast_2d() if x.ndim >= 2 else out