Source code for secml.data.splitter.c_datasplitter

"""
.. module:: CDataSplitter
   :synopsis: Common interface for dataset splitting

.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>
.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from abc import ABCMeta, abstractmethod

from secml.core import CCreator


[docs]class CDataSplitter(CCreator, metaclass=ABCMeta):
    """Abstract class that defines basic methods for dataset splitting.

    Parameters
    ----------
    num_folds : int, optional
        Number of folds to create. Default 3.
        This corresponds to the size of tr_idx and ts_idx lists.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, is the RandomState instance used by np.random.

    """
    __super__ = 'CDataSplitter'

    def __init__(self, num_folds=3, random_state=None):

        self.num_folds = num_folds
        self.random_state = random_state

        self._tr_idx = []  # Training set indices for each fold
        self._ts_idx = []  # Test set indices for each fold

    @property
    def tr_idx(self):
        """List of training idx obtained with the split of the data."""
        return self._tr_idx

    @property
    def ts_idx(self):
        """List of test idx obtained with the split of the data."""
        return self._ts_idx

    def __iter__(self):
        """Return a train/test indices pair for each fold."""
        for f in range(self.num_folds):
            yield self._tr_idx[f], self._ts_idx[f]

[docs]    @abstractmethod
    def compute_indices(self, dataset):
        """Compute training set and test set indices for each fold.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        CDataSplitter
            Instance of the dataset splitter with tr/ts indices.

        """
        raise NotImplementedError("Each data splitting algorithm must define "
                                  "a `compute_indices` method.")

[docs]    def split(self, dataset):
        """Returns a list of split datasets.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        split_ds : list of tuple
            List of tuples (training set, test set), one for each fold.

        """
        # Computing splitting indices
        self.compute_indices(dataset)

        # For each fold, return a tuple (training set, test set)
        ds_list = []
        for tr_idx, ts_idx in self:
            ds_list.append((dataset[tr_idx, :], dataset[ts_idx, :]))

        return ds_list