Source code for secml.data.splitter.c_datasplitter_shuffle

"""
.. module:: CDataSplitterShuffle
   :synopsis: Random permutation splitting

.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>
.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from sklearn.model_selection import ShuffleSplit

from secml.array import CArray
from secml.data.splitter import CDataSplitter


class CDataSplitterShuffle(CDataSplitter):
    """Random permutation dataset splitting.

    Yields indices to split data into training and test sets.

    Note: contrary to other dataset splitting strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Parameters
    ----------
    num_folds : int, optional
        Number of folds to create. Default 3.
        This corresponds to the size of the tr_idx and ts_idx lists.
    train_size : float, int, or None, optional
        If None (default), the value is automatically set to the
        complement of the test size. If float, should be between
        0.0 and 1.0 and represent the proportion of the dataset
        to include in the train split. If int, represents the
        absolute number of train samples.
    test_size : float, int, or None, optional
        If None (default), the value is automatically set to the
        complement of the train size. If float, should be between
        0.0 and 1.0 and represent the proportion of the dataset
        to include in the test split. If int, represents the
        absolute number of test samples.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, is the RandomState instance used by np.random.

    Attributes
    ----------
    class_type : 'shuffle'

    Notes
    -----
    train_size and test_size cannot both be None. If one is set to None,
    the other should be a float (representing a percentage) or an integer.

    Examples
    --------
    >>> from secml.data import CDataset
    >>> from secml.data.splitter import CDataSplitterShuffle

    >>> ds = CDataset([[1,2],[3,4],[5,6]],[1,0,1])
    >>> shuffle = CDataSplitterShuffle(num_folds=3, train_size=0.5, random_state=0).compute_indices(ds)
    >>> shuffle.num_folds
    3
    >>> shuffle.tr_idx
    [CArray(1,)(dense: [0]), CArray(1,)(dense: [1]), CArray(1,)(dense: [1])]
    >>> shuffle.ts_idx
    [CArray(2,)(dense: [2 1]), CArray(2,)(dense: [2 0]), CArray(2,)(dense: [0 2])]

    >>> # Setting the train_size or the test_size to an arbitrary percentage
    >>> shuffle = CDataSplitterShuffle(num_folds=3, train_size=0.2, random_state=0).compute_indices(ds)
    >>> shuffle.num_folds
    3
    >>> shuffle.tr_idx
    [CArray(0,)(dense: []), CArray(0,)(dense: []), CArray(0,)(dense: [])]
    >>> shuffle.ts_idx
    [CArray(3,)(dense: [2 1 0]), CArray(3,)(dense: [2 0 1]), CArray(3,)(dense: [0 2 1])]

    """
    __class_type = 'shuffle'

    def __init__(self, num_folds=3, train_size=None,
                 test_size=None, random_state=None):

        super(CDataSplitterShuffle, self).__init__(
            num_folds=num_folds, random_state=random_state)

        self.train_size = train_size
        self.test_size = test_size
    def compute_indices(self, dataset):
        """Compute training set and test set indices for each fold.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        CDataSplitter
            Instance of the dataset splitter with tr/ts indices.

        """
        # Resetting indices
        self._tr_idx = []
        self._ts_idx = []

        sk_splitter = ShuffleSplit(n_splits=self.num_folds,
                                   train_size=self.train_size,
                                   test_size=self.test_size,
                                   random_state=self.random_state)

        # We take sklearn indices (iterators) and map them to a list of CArrays
        for train_index, test_index in \
                sk_splitter.split(dataset.X.get_data()):
            train_index = CArray(train_index)
            test_index = CArray(test_index)
            self._tr_idx.append(train_index)
            self._ts_idx.append(test_index)

        return self
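

# --- Usage sketch (not part of the original module) ---
# A minimal example of how the per-fold indices returned by
# compute_indices() can be used to slice a dataset. Slicing via
# ``ds.X[idx, :]`` / ``ds.Y[idx]`` relies on CArray fancy indexing and is
# an assumption for illustration, not something shown in the code above.
if __name__ == '__main__':
    from secml.data import CDataset

    # Toy dataset: 4 patterns with 2 features each, binary labels
    ds = CDataset([[1, 2], [3, 4], [5, 6], [7, 8]], [1, 0, 1, 0])

    # 3 random splits, 50% training samples, fixed seed for reproducibility
    splitter = CDataSplitterShuffle(
        num_folds=3, train_size=0.5, random_state=0).compute_indices(ds)

    for fold, (tr_idx, ts_idx) in enumerate(
            zip(splitter.tr_idx, splitter.ts_idx)):
        # Use the CArray indices of each fold to extract patterns and labels
        X_tr, y_tr = ds.X[tr_idx, :], ds.Y[tr_idx]
        X_ts, y_ts = ds.X[ts_idx, :], ds.Y[ts_idx]
        print("fold {:}: {:} train / {:} test samples".format(
            fold, X_tr.shape[0], X_ts.shape[0]))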