# Source code for secml.data.splitter.c_datasplitter_labelkfold

"""
.. module:: CDataSplitterLabelKFold
   :synopsis: Label K-Fold splitting

.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from secml.array import CArray
from secml.data.splitter import CDataSplitter


class CDataSplitterLabelKFold(CDataSplitter):
    """K-Folds dataset splitting with non-overlapping labels.

    The same label will not appear in two different folds
    (the number of distinct labels has to be at least equal
    to the number of folds).

    The folds are approximately balanced in the sense that the
    number of distinct labels is approximately the same in each fold.

    Parameters
    ----------
    num_folds : int, optional
        Number of folds to create. Default 3.
        This corresponds to the size of tr_idx and ts_idx lists.

    Attributes
    ----------
    class_type : 'label-kfold'

    Examples
    --------
    >>> from secml.data import CDataset
    >>> from secml.data.splitter import CDataSplitterLabelKFold
    >>> ds = CDataset([[1,2],[3,4],[5,6],[7,8]], [1,0,1,2])
    >>> kfold = CDataSplitterLabelKFold(num_folds=3).compute_indices(ds)
    >>> print(kfold.num_folds)
    3
    >>> print(kfold.tr_idx)
    [CArray(2,)(dense: [1 3]), CArray(3,)(dense: [0 1 2]), CArray(3,)(dense: [0 2 3])]
    >>> print(kfold.ts_idx)
    [CArray(2,)(dense: [0 2]), CArray(1,)(dense: [3]), CArray(1,)(dense: [1])]

    """
    __class_type = 'label-kfold'

    def __init__(self, num_folds=3):

        super(CDataSplitterLabelKFold, self).__init__(num_folds=num_folds)

    def compute_indices(self, dataset):
        """Compute training set and test set indices for each fold.

        Every distinct label of the dataset is assigned to exactly one
        fold. Labels are processed from the most to the least frequent
        and each one is given to the fold that currently holds the
        fewest samples. For each fold, the test indices are the samples
        whose label was assigned to that fold and the training indices
        are all the remaining samples.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        CDataSplitter
            Instance of the dataset splitter with tr/ts indices.

        Raises
        ------
        ValueError
            If the number of folds is greater than the number of
            distinct labels in the dataset.

        """
        # Resetting indices (done before validation, so a failed call
        # leaves the splitter with empty index lists)
        self._tr_idx = []
        self._ts_idx = []

        # `labels` is the inverse mapping returned by `unique`: the labels
        # of the samples re-expressed as positions inside `unique_labels`
        unique_labels, labels = dataset.Y.unique(return_inverse=True)
        n_labels = unique_labels.size

        if self.num_folds > n_labels:
            raise ValueError(
                ("Cannot have number of folds ({0}) greater"
                 " than the number of classes: {1}.").format(
                    self.num_folds, n_labels))

        # Weight labels by their number of occurrences
        n_samples_per_label = labels.bincount()

        # Distribute the most frequent labels first:
        # `indices` lists the label positions by decreasing weight
        indices = n_samples_per_label.argsort(axis=None)[::-1]
        n_samples_per_label = n_samples_per_label[indices]

        # Total weight of each fold
        n_samples_per_fold = CArray.zeros(self.num_folds, dtype=int)

        # Mapping from label index to fold index
        label_to_fold = CArray.zeros(n_labels, dtype=int)

        # Distribute samples by adding the largest weight to the lightest fold
        for label_index, weight in enumerate(n_samples_per_label):
            lightest_fold = n_samples_per_fold.argmin()
            n_samples_per_fold[lightest_fold] += weight
            # `label_index` walks the sorted order; map it back to the
            # original label position through `indices`
            label_to_fold[indices[label_index]] = lightest_fold

        # Fold assigned to each sample, obtained through its label
        fold_labels = label_to_fold[labels]

        for fold_idx in range(self.num_folds):
            # Samples of the current fold are tested, all the others train
            test_indices = fold_labels.find(fold_labels == fold_idx)
            train_indices = fold_labels.find(fold_labels != fold_idx)
            self._ts_idx.append(CArray(test_indices))
            self._tr_idx.append(CArray(train_indices))

        return self