"""
.. module:: CClassifierSVM
:synopsis: Support Vector Machine (SVM) classifier
.. moduleauthor:: Battista Biggio <battista.biggio@unica.it>
"""

from sklearn.svm import SVC

from secml.array import CArray
from secml.ml.classifiers import CClassifier
from secml.ml.classifiers.clf_utils import convert_binary_labels
from secml.ml.kernels import CKernel
from secml.ml.classifiers.loss import CLossHinge
from secml.parallel import parfor2


def _fit_one_ova(tr_class_idx, svm, x, y, svc_kernel, verbose):
    """Fit a single one-vs-all (OVA) binary classifier.

Parameters
----------
tr_class_idx : int
Index of the label against which the classifier should be trained.
svm : CClassifierSVM
Instance of the multiclass SVM classifier.
x : CArray
Array to be used for training with shape (n_samples, n_features).
y : CArray
        Array of shape (n_samples,) containing the class labels.
    svc_kernel : str
        Kernel type passed to `sklearn.svm.SVC`: 'linear' when training
        in the primal, 'precomputed' when training in the dual.
    verbose : int
Verbosity level of the logger.
"""
    # Reset verbosity level. This is needed as objects change id when
    # passed to subprocesses, and our logging level is stored per-object,
    # keyed on the object's id.
svm.verbose = verbose
svm.logger.info(
"Training against class: {:}".format(tr_class_idx))
# Binarize labels
y_ova = CArray(y == svm.classes[tr_class_idx])
# Training the one-vs-all classifier
svc = SVC(C=svm.C, kernel=svc_kernel, class_weight=svm.class_weight)
svc.fit(x.get_data(), y_ova.get_data())
# Assign output based on kernel type
w = CArray(svc.coef_.ravel()) if svm.kernel is None else None
sv_idx = CArray(svc.support_).ravel() if svm.kernel is not None else None
alpha = CArray(svc.dual_coef_) if svm.kernel is not None else None
# Intercept is always available
b = CArray(svc.intercept_[0])[0]
return w, sv_idx, alpha, b


class CClassifierSVM(CClassifier):
"""Support Vector Machine (SVM) classifier.
Parameters
----------
C : float, optional
Penalty hyper-parameter C of the error term. Default 1.0.
kernel : None or CKernel subclass, optional
Instance of a CKernel subclass to be used for computing
similarity between patterns. If None (default), a linear
SVM is trained in the primal; otherwise an SVM is trained in the dual,
using the precomputed kernel values.
class_weight : {dict, 'balanced', None}, optional
Set the parameter C of class i to `class_weight[i] * C`.
If not given (default), all classes are supposed to have
weight one. The 'balanced' mode uses the values of labels to
automatically adjust weights inversely proportional to
class frequencies as `n_samples / (n_classes * np.bincount(y))`.
preprocess : CModule or str or None, optional
Features preprocess to be applied to input data.
Can be a CPreProcess subclass or a string with the type of the
desired preprocessor. If None, input data is used as is.
n_jobs : int, optional
Number of parallel workers to use for the classifier.
Cannot be higher than processor's number of cores. Default is 1.

    Attributes
    ----------
    class_type : 'svm'

    Notes
    -----
    Current implementation relies on :class:`sklearn.svm.SVC` for
    the training step.

    See Also
    --------
    CKernel : Pairwise kernels and metrics.
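
    Examples
    --------
    A minimal usage sketch on a toy two-class dataset; it assumes that
    'rbf' is a class_type registered with :class:`CKernel`:

    >>> from secml.array import CArray
    >>> from secml.ml.classifiers import CClassifierSVM
    >>> x = CArray([[0., 0.], [1., 1.], [4., 4.], [5., 5.]])
    >>> y = CArray([0, 0, 1, 1])
    >>> lin_svm = CClassifierSVM(C=1.0).fit(x, y)  # primal: learns (w, b)
    >>> rbf_svm = CClassifierSVM(kernel='rbf').fit(x, y)  # dual: (alpha, b)
    >>> scores = rbf_svm.decision_function(x)  # (n_samples, n_classes)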
"""
__class_type = 'svm'

    _loss = CLossHinge()

    def __init__(self, C=1.0, kernel=None,
                 class_weight=None, preprocess=None, n_jobs=1):
# calling the superclass init
CClassifier.__init__(self, preprocess=preprocess, n_jobs=n_jobs)
# Classifier hyperparameters
self.C = C
self.class_weight = class_weight
# After-training attributes
self._w = None
self._b = None
self._alpha = None
self._sv_idx = None # idx of SVs in TR data (only for binary SVM)
self._kernel = None
if kernel is not None:
self._kernel = CKernel.create(kernel)
# set pre-processing chain as svm <- kernel <- preprocess
self._kernel.preprocess = self.preprocess
self._preprocess = self._kernel
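            # at forward/backward time, inputs thus flow through the
            # optional preprocess first and then through the kernel, which
            # computes similarities against its reference vectors kernel.rv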

    @property
def sv_idx(self):
"""Indices of Support Vectors within the training dataset."""
return self._sv_idx

    @property
    def kernel(self):
        """Kernel function (None or a CKernel instance)."""
        return self._kernel

    @property
def class_weight(self):
"""Weight of each training class."""
return self._class_weight

    @class_weight.setter
    def class_weight(self, value):
        """Sets the weight of each training class.

Parameters
----------
value : {dict, 'balanced', None}
Set the parameter C of class i to `class_weight[i] * C`.
If None, all classes are supposed to have weight one.
            The 'balanced' mode uses the values of labels to automatically
adjust weights inversely proportional to class frequencies
as `n_samples / (n_classes * np.bincount(y))`.
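
            For example, with `y = [0, 0, 0, 1]`: `n_samples = 4`,
            `n_classes = 2` and `np.bincount(y) = [3, 1]`, so the weights
            are `4 / (2 * [3, 1]) = [0.67, 2.0]` and errors on the
            minority class are penalized with `C * 2.0`.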
"""
# TODO we can have one weight per class but only for OVO
        if isinstance(value, dict) and len(value) != 2:
            raise ValueError("only the weights of the positive (+1) and "
                             "negative (0) classes can be specified.")
self._class_weight = value

    @property
    def w(self):
        """Feature weights of the linear SVM (None if a kernel is set)."""
        return self._w

    @property
    def b(self):
        """Bias term (intercept) of the decision function."""
        return self._b

    @property
def alpha(self):
"""Signed coefficients of the SVs in the decision function."""
return self._alpha

    @property
def C(self):
"""Penalty parameter C of the error term."""
return self._C

    @C.setter
    def C(self, value):
        """Set the penalty parameter C of the error term.

Parameters
----------
value : float
Penalty parameter C of the error term.
"""
self._C = float(value)

    def _fit(self, x, y):
        """Trains the SVM classifier (one binary SVM, or One-Vs-All
        for multiclass data).

Parameters
----------
x : CArray
Array to be used for training with shape (n_samples, n_features).
        y : CArray
            Array of shape (n_samples,) containing the class labels.

Returns
-------
CClassifierSVM
Trained classifier.
"""
self.logger.info(
"Training SVM with parameters: {:}".format(self.get_params()))
# reset training
self._w = None
self._b = None
self._alpha = None
self._sv_idx = None
# shape of w or alpha
n_rows = self.n_classes if self.n_classes > 2 else 1
n_cols = x.shape[1]
# initialize params
if self.kernel is None:
# no kernel pre-processing, training in the primal
svc_kernel = 'linear'
self._w = CArray.zeros(shape=(n_rows, n_cols))
else:
# inputs are kernel values, training in the dual
svc_kernel = 'precomputed'
self._alpha = CArray.zeros(shape=(n_rows, n_cols), sparse=True)
self._b = CArray.zeros(shape=(self.n_classes,))
if self.n_classes > 2:
# fit OVA
self._fit_one_vs_all(x, y, svc_kernel)
else:
# fit binary
self._fit_binary(x, y, svc_kernel)
# remove unused support vectors from kernel
if self.kernel is not None: # trained in the dual
sv = abs(self._alpha).sum(axis=0) > 0
self.kernel.rv = self.kernel.rv[sv, :]
self._alpha = self._alpha[:, sv]
self._sv_idx = CArray(sv.find(sv > 0)).ravel() # store SV indices
return self

    def _fit_one_vs_all(self, x, y, svc_kernel):
        # OVA scheme; OVO could be implemented as a separate function
out = parfor2(_fit_one_ova,
self.n_classes, self.n_jobs,
self, x, y, svc_kernel, self.verbose)
# Building results
for i in range(self.n_classes):
out_i = out[i]
if self.kernel is None:
self._w[i, :] = out_i[0]
else:
self._alpha[i, out_i[1]] = out_i[2]
self._b[i] = out_i[3]

    def _fit_binary(self, x, y, svc_kernel):
svc = SVC(C=self.C, kernel=svc_kernel, class_weight=self.class_weight)
if svc_kernel == 'precomputed':
# training on sparse precomputed kernels is not supported
svc.fit(x.tondarray(), y.get_data())
else:
svc.fit(x.get_data(), y.get_data())
if self.kernel is None:
self._w = CArray(svc.coef_)
else:
sv_idx = CArray(svc.support_).ravel()
self._alpha[sv_idx] = CArray(svc.dual_coef_)
self._b = CArray(svc.intercept_[0])[0]

    def _forward(self, x):
        """Compute the decision function for SVMs, proportional to the
        distance of x to the separating hyperplane.

        For a nonlinear SVM, the kernel between the input patterns and the
        Support Vectors is computed, and then the inner product of the
        resulting array with the alphas is calculated.

Parameters
----------
x : CArray
Array with new patterns to classify, 2-Dimensional of shape
(n_patterns, n_features) or (n_patterns, n_sv) if kernel is used.

        Returns
        -------
        score : CArray
            Value of the decision function for each test pattern,
            as a dense 2-D array of shape (n_samples, n_classes).
"""
v = self.w if self.kernel is None else self.alpha
score = CArray(x.dot(v.T)).todense() + self.b
if self.n_classes > 2: # return current score matrix
scores = score
else: # concatenate scores
scores = CArray.ones(shape=(x.shape[0], self.n_classes))
scores[:, 0] = -score.ravel().T
scores[:, 1] = score.ravel().T
return scores

    def _backward(self, w):
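        # Here `w` is the upstream gradient to be back-propagated, not the
        # SVM weight vector: the scores are +/- (x.dot(v.T) + b) in the
        # binary case, so their gradient w.r.t. the input is +/- v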
v = self.w if self.kernel is None else self.alpha
if self.n_classes > 2:
return w.dot(v)
else:
return w[0] * -v + w[1] * v

    # --------------- OTHER GRADIENTS ----------------

    def _sv_margin(self, tol=1e-6):
"""Return the margin support vectors."""
if self.n_classes > 2:
raise ValueError("SVM is not binary!")
assert (self.kernel.rv.shape[0] == self.alpha.shape[1])
alpha = self.alpha.todense()
s = alpha.find(
(abs(alpha) >= tol) *
(abs(alpha) <= self.C - tol))
if len(s) > 0:
return self.kernel.rv[s, :], CArray(s)
else: # no margin SVs
return None, None

    def _kernel_function(self, x, z=None):
"""Compute kernel matrix between x and z, without pre-processing."""
# clone kernel removing rv and pre-processing
kernel_params = self.kernel.get_params()
kernel_params.pop('preprocess') # detach preprocess and rv
kernel_params.pop('rv')
kernel_params.pop('n_jobs') # TODO: not accepted by kernel constructor
kernel = CKernel.create(self.kernel.class_type, **kernel_params)
z = z if z is not None else x
return kernel.k(x, z)

    def hessian_tr_params(self, x=None, y=None):
"""
Hessian of the training objective w.r.t. the classifier parameters.
"""
xs, _ = self._sv_margin() # these points are already normalized
s = xs.shape[0]
H = CArray.ones(shape=(s + 1, s + 1))
H[:s, :s] = self._kernel_function(xs)
H[-1, -1] = 0
return H

    def grad_f_params(self, x, y=1):
"""Derivative of the decision function w.r.t. alpha and b
Parameters
----------
        x : CArray
            Samples on which the decision function is computed.
        y : int
            Index of the class w.r.t. which the gradient must be computed.
        """
xs, _ = self._sv_margin() # these points are already preprocessed
if xs is None:
self.logger.debug("Warning: sv_margin is empty "
"(all points are error vectors).")
return None
        s = xs.shape[0]  # number of margin support vectors
        k = x.shape[0]  # number of samples in x
Ksk_ext = CArray.ones(shape=(s + 1, k))
sv = self.kernel.rv # store and recover current sv set
self.kernel.rv = xs
Ksk_ext[:s, :] = self.kernel.forward(x).T # x and xs are preprocessed
self.kernel.rv = sv
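        # df/dalpha_j = y * K(x_j^s, x) for each margin SV and df/db = y:
        # the extended kernel matrix (last row of ones) covers both terms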
return convert_binary_labels(y) * Ksk_ext # (s + 1) * k

    def grad_loss_params(self, x, y, loss=None):
"""
Derivative of the loss w.r.t. the classifier parameters (alpha, b)
dL / d_params = dL / df * df / d_params
Parameters
----------
x : CArray
Features of the dataset on which the loss is computed.
y : CArray
Labels of the training samples.
        loss : None (default) or CLoss
            If loss is None (default), the classifier's own loss is used
            to compute the derivative.
"""
if loss is None:
loss = self._loss
# compute the loss derivative w.r.t. alpha
f_params = self.grad_f_params(x) # (s + 1) * n_samples
scores = self.decision_function(x)
dL_s = loss.dloss(y, score=scores).atleast_2d()
dL_params = dL_s * f_params # (s + 1) * n_samples
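        # the training objective is C * sum_i L(y_i, f(x_i)) + regularizer,
        # hence the loss term of the gradient is scaled by C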
grad = self.C * dL_params
return grad

    def grad_tr_params(self, x, y):
"""Derivative of the classifier training objective w.r.t.
the classifier parameters.
dL / d_params = dL / df * df / d_params + dReg / d_params
Parameters
----------
x : CArray
Features of the dataset on which the loss is computed.
        y : CArray
            Labels of the training samples.
        """
grad = self.grad_loss_params(x, y) # (s+1) * n_samples
# compute the regularizer derivative w.r.t alpha
xs, idx = self._sv_margin()
k = self._kernel_function(xs)
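        # d/dalpha of the quadratic regularizer alpha^T K alpha, restricted
        # to the margin SVs (the other alphas sit at the box constraints)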
d_reg = 2 * k.dot(self.alpha[idx].T) # s * 1
# add the regularizer to the gradient of the alphas
s = idx.size
grad[:s, :] += d_reg
return grad # (s+1) * n_samples