Source code for secml.data.splitter.c_datasplitter_shuffle

"""
.. module:: CDataSplitterShuffle
   :synopsis: Random permutation splitting

.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>
.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from sklearn.model_selection import ShuffleSplit

from secml.array import CArray
from secml.data.splitter import CDataSplitter


class CDataSplitterShuffle(CDataSplitter):
    """Random permutation dataset splitting.

    Yields indices to split data into training and test sets.

    Note: contrary to other dataset splitting strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Parameters
    ----------
    num_folds : int, optional
        Number of folds to create. Default 3.
        This corresponds to the size of the tr_idx and ts_idx lists.
    train_size : float, int, or None, optional
        If None (default), the value is automatically set to the
        complement of the test size. If float, should be between
        0.0 and 1.0 and represent the proportion of the dataset
        to include in the train split. If int, represents the
        absolute number of train samples.
    test_size : float, int, or None, optional
        If None (default), the value is automatically set to the
        complement of the train size. If float, should be between
        0.0 and 1.0 and represent the proportion of the dataset
        to include in the test split. If int, represents the
        absolute number of test samples.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, is the RandomState instance used by np.random.

    Attributes
    ----------
    class_type : 'shuffle'

    Notes
    -----
    train_size and test_size cannot both be None. If one is set to None,
    the other should be a float (representing a percentage) or an integer.

    Examples
    --------
    >>> from secml.data import CDataset
    >>> from secml.data.splitter import CDataSplitterShuffle

    >>> ds = CDataset([[1,2],[3,4],[5,6]],[1,0,1])
    >>> shuffle = CDataSplitterShuffle(num_folds=3, train_size=0.5, random_state=0).compute_indices(ds)
    >>> shuffle.num_folds
    3
    >>> shuffle.tr_idx
    [CArray(1,)(dense: [0]), CArray(1,)(dense: [1]), CArray(1,)(dense: [1])]
    >>> shuffle.ts_idx
    [CArray(2,)(dense: [2 1]), CArray(2,)(dense: [2 0]), CArray(2,)(dense: [0 2])]

    >>> # Setting the train_size or the test_size to an arbitrary percentage
    >>> shuffle = CDataSplitterShuffle(num_folds=3, train_size=0.2, random_state=0).compute_indices(ds)
    >>> shuffle.num_folds
    3
    >>> shuffle.tr_idx
    [CArray(0,)(dense: []), CArray(0,)(dense: []), CArray(0,)(dense: [])]
    >>> shuffle.ts_idx
    [CArray(3,)(dense: [2 1 0]), CArray(3,)(dense: [2 0 1]), CArray(3,)(dense: [0 2 1])]

    """
    __class_type = 'shuffle'

    def __init__(self, num_folds=3, train_size=None,
                 test_size=None, random_state=None):

        super(CDataSplitterShuffle, self).__init__(
            num_folds=num_folds, random_state=random_state)

        self.train_size = train_size
        self.test_size = test_size
    def compute_indices(self, dataset):
        """Compute training set and test set indices for each fold.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        CDataSplitter
            Instance of the dataset splitter with tr/ts indices.

        """
        # Resetting indices
        self._tr_idx = []
        self._ts_idx = []

        sk_splitter = ShuffleSplit(n_splits=self.num_folds,
                                   train_size=self.train_size,
                                   test_size=self.test_size,
                                   random_state=self.random_state)

        # We take sklearn indices (iterators) and map them to a list of CArrays
        for train_index, test_index in \
                sk_splitter.split(dataset.X.get_data()):
            train_index = CArray(train_index)
            test_index = CArray(test_index)
            self._tr_idx.append(train_index)
            self._ts_idx.append(test_index)

        return self
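

# --- Usage sketch (not part of the original module) ---
# A minimal example of how the per-fold indices returned by
# compute_indices() can be used to slice a dataset. Slicing via
# ``ds.X[idx, :]`` / ``ds.Y[idx]`` relies on CArray fancy indexing and is
# an assumption for illustration, not something shown in the code above.
if __name__ == '__main__':
    from secml.data import CDataset

    # Toy dataset: 4 patterns with 2 features each, binary labels
    ds = CDataset([[1, 2], [3, 4], [5, 6], [7, 8]], [1, 0, 1, 0])

    # 3 random splits, 50% training samples, fixed seed for reproducibility
    splitter = CDataSplitterShuffle(
        num_folds=3, train_size=0.5, random_state=0).compute_indices(ds)

    for fold, (tr_idx, ts_idx) in enumerate(
            zip(splitter.tr_idx, splitter.ts_idx)):
        # Use the CArray indices of each fold to extract patterns and labels
        X_tr, y_tr = ds.X[tr_idx, :], ds.Y[tr_idx]
        X_ts, y_ts = ds.X[ts_idx, :], ds.Y[ts_idx]
        print("fold {:}: {:} train / {:} test samples".format(
            fold, X_tr.shape[0], X_ts.shape[0]))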