Source code for secml.ml.features.reduction.c_reducer_pca

"""
.. module:: CPCA
   :synopsis: Principal Component Analysis (PCA)

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from secml.array import CArray
from secml.ml.features.reduction import CReducer
from secml.utils.mixed_utils import check_is_fitted

__all__ = ['CPCA']


[docs]class CPCA(CReducer): """Principal Component Analysis (PCA). Parameters ---------- preprocess : CPreProcess or str or None, optional Features preprocess to be applied to input data. Can be a CPreProcess subclass or a string with the type of the desired preprocessor. If None, input data is used as is. Attributes ---------- class_type : 'pca' """ __class_type = 'pca' def __init__(self, n_components=None, preprocess=None): """Principal Component Analysis (PCA) Apply a linear transformation at data, project it into a new spaces where total variance of data is maximized (axis are the eigenvector of your matrix). If number of components is less than number of patterns is useful for project data into a low dimensional space. This implementation uses the scipy.linalg implementation of the singular value reduction. It only works for dense arrays and is not scalable to large dimensional data. Parameters ---------- n_components : None or int, optional Number of components to keep. If n_components is not set:: n_components == min(n_samples, n_features) If ``0 < n_components < 1``, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. Notes ----- While this implementation transparently works for sparse format arrays too, data is converted to dense form internally and thus memory consumption can be high. Examples -------- >>> from secml.array import CArray >>> from secml.ml.features.reduction import CPCA >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]]) >>> CPCA().fit_transform(array) CArray(3, 3)(dense: [[-4.078722e+00 -2.478266e+00 1.855417e-17] [-2.727232e+00 2.829603e+00 6.708859e-17] [ 6.805954e+00 -3.513362e-01 -1.757349e-16]]) """ self.n_components = n_components self._eigenvec = None self._eigenval = None self._components = None self._mean = None self._explained_variance = None self._explained_variance_ratio = None super(CPCA, self).__init__(preprocess=preprocess) @property def eigenval(self): """Eigenvalues estimated from the training data.""" return self._eigenval @property def eigenvec(self): """Eigenvectors estimated from the training data.""" return self._eigenvec @property def components(self): """Eigenvectors of inverse training array.""" return self._components @property def mean(self): """Per-feature empirical mean, estimated from the training data.""" return self._mean @property def explained_variance(self): """Variance explained by each of the selected components.""" return self._explained_variance @property def explained_variance_ratio(self): """Percentage of variance explained by each of the selected components. If n_components is None, then all components are stored and the sum of explained variances is equal to 1.0 """ return self._explained_variance_ratio def _check_is_fitted(self): """Check if the preprocessor is trained (fitted). Raises ------ NotFittedError If the preprocessor is not fitted. """ check_is_fitted(self, ['components', 'mean']) def _fit(self, x, y=None): """Fit the PCA using input data. Parameters ---------- x : array_like Training data, 2-Dim array like object with shape (n_patterns, n_features), where each row is a pattern of n_features columns. y : CArray or None, optional Flat array with the label of each pattern. Can be None if not required by the preprocessing algorithm. Returns ------- CPCA Instance of the trained transformer. Examples -------- >>> from secml.array import CArray >>> from secml.ml.features.reduction import CPCA >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]]) >>> pca = CPCA().fit(array) >>> pca.eigenval CArray(3,)(dense: [8.390159e+00 3.777816e+00 1.909570e-17]) >>> pca.eigenvec CArray(3, 3)(dense: [[-0.486132 -0.656005 0.57735 ] [-0.325051 0.749005 0.57735 ] [ 0.811183 -0.093 0.57735 ]]) >>> pca.explained_variance CArray(3,)(dense: [3.519739e+01 7.135946e+00 1.823230e-34]) >>> pca.explained_variance_ratio CArray(3,)(dense: [8.314343e-01 1.685657e-01 4.306842e-36]) """ data_carray = CArray(x).todense().atleast_2d() # Max number of components is the number of patterns available (rows) n_samples = data_carray.shape[0] n_features = data_carray.shape[1] if self.n_components is None: self.n_components = min(n_samples, n_features) else: if self.n_components > n_samples: raise ValueError("maximum number of components is {:}".format(n_samples)) # Centering training data self._mean = data_carray.mean(axis=0, keepdims=False) data_carray -= self.mean # Performing training of PCA (used by KernelPCA too) return self._svd_train(data_carray) def _svd_train(self, data_carray): """Linear PCA training routine, used also by KernelPCA.""" # Computing SVD reduction from numpy import linalg from sklearn.utils.extmath import svd_flip u, s, v = linalg.svd(data_carray.atleast_2d().tondarray(), full_matrices=False) # flip eigenvectors' sign to enforce deterministic output u, v = svd_flip(u, v) eigenvec = CArray(u) eigenval = CArray(s) components = CArray(v) # Now we sort the eigenvalues/eigenvectors idx = (-eigenval).argsort(axis=None) eigenval = CArray(eigenval[idx]) eigenvec = CArray(eigenvec[:, idx]).atleast_2d() components = CArray(components[idx, :]).atleast_2d() # percentage of variance explained by each component explained_variance = (eigenval ** 2) / (data_carray.shape[0] - 1) explained_variance_ratio = explained_variance / explained_variance.sum() if 0 < self.n_components < 1.0: # number of components for which the cumulated explained variance # percentage is superior to the desired threshold ratio_cumsum = explained_variance_ratio.cumsum() self.n_components = CArray(ratio_cumsum < self.n_components).sum() + 1 # Consider only n_components self._eigenval = CArray(eigenval[:self.n_components]) self._eigenvec = CArray(eigenvec[:, :self.n_components]) self._components = CArray(components[:self.n_components, :]) # storing explained variance of n_components only self._explained_variance = explained_variance[:self.n_components] self._explained_variance_ratio = explained_variance_ratio[:self.n_components] return self def _forward(self, x): """Apply the reduction algorithm on data. Parameters ---------- x : array_like Array to be transformed. 2-D array object of shape (n_patterns, n_features). n_features must be equal to n_components parameter set before or during training. Returns -------- CArray Input data mapped to PCA space. Examples -------- >>> from secml.array import CArray >>> from secml.ml.features.reduction import CPCA >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]]) >>> pca = CPCA().fit(array) >>> pca.transform(CArray.concatenate(array, [4., 2., -6.], axis=0)) CArray(4, 3)(dense: [[-4.078722e+00 -2.478266e+00 1.855417e-17] [-2.727232e+00 2.829603e+00 6.708859e-17] [ 6.805954e+00 -3.513362e-01 -1.757349e-16] [ 3.209152e+00 1.129680e+00 3.296909e+00]]) >>> pca.transform([4., 2.]) Traceback (most recent call last): ... ValueError: array to transform must have 3 features (columns). """ data_carray = CArray(x).todense().atleast_2d() if data_carray.shape[1] != self.mean.size: raise ValueError("array to transform must have {:} " "features (columns).".format(self.mean.size)) out = CArray((data_carray - self.mean).dot(self._components.T)) return out.atleast_2d() if x.ndim >= 2 else out def _inverse_transform(self, x): """Map data back to its original space. Parameters ---------- x : CArray Array to transform back to its original space. Returns -------- CArray Input array mapped back to its original space. Examples -------- >>> from secml.array import CArray >>> from secml.ml.features.reduction import CPCA >>> array = CArray([[1., 0., 2.], [2., 5., 0.], [0., 1., -9.]]) >>> pca = CPCA().fit(array) >>> array_pca = pca.transform(array) >>> pca.inverse_transform(array_pca).round(6) CArray(3, 3)(dense: [[ 1. -0. 2.] [ 2. 5. -0.] [-0. 1. -9.]]) """ data_carray = CArray(x).atleast_2d() if data_carray.shape[1] != self.n_components: raise ValueError("array to revert must have {:} " "features (columns).".format(self.n_components)) out = CArray(data_carray.dot(self._components) + self.mean) return out.atleast_2d() if x.ndim >= 2 else out