Source code for secml.data.splitter.c_chronological_splitter

"""
.. module:: CChronologicalSplitter
   :synopsis: Dataset splitter based on timestamps

.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from dateutil import parser
from datetime import datetime

from secml.core import CCreator
from secml.core.type_utils import is_int, is_float
from secml.array import CArray
from secml.data import CDataset


class CChronologicalSplitter(CCreator):
    """Dataset splitter based on timestamps.

    Split dataset into train and test subsets,
    using a timestamp as split point.

    A dataset containing `timestamp` and `timestamp_fmt`
    header attributes is required.

    Parameters
    ----------
    th_timestamp : str
        The split point in time between training and test set.
        Samples having `timestamp <= th_timestamp` will be put in the
        training set, while samples with `timestamp > th_timestamp`
        will be used for the test set.
        The timestamp must follow the ISO 8601 format.
        Incomplete timestamps (e.g. a single year '2016') are also accepted.
    train_size : float or int, optional
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the samples having `timestamp <= th_timestamp`
        to include in the train split. Default 1.0.
        If int, represents the absolute number of train samples.
    test_size : float or int, optional
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the samples having `timestamp > th_timestamp`
        to include in the test split. Default 1.0.
        If int, represents the absolute number of test samples.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the RandomState instance used by np.random is used.
    shuffle : bool, optional
        Whether or not to shuffle the data before splitting. Default True.

    """

    def __init__(self, th_timestamp, train_size=1.0, test_size=1.0,
                 random_state=None, shuffle=True):

        if (is_float(test_size) and (test_size <= 0 or test_size > 1.0)) or \
                (is_float(train_size) and (train_size <= 0 or train_size > 1.0)):
            raise ValueError("`test_size` and `train_size` "
                             "must be in (0, 1.0] if float")

        # We use dateutil.parser in order to allow incomplete
        # timestamps (e.g. a single year '2016')
        self.th_timestamp = parser.isoparse(th_timestamp)

        self.train_size = train_size
        self.test_size = test_size
        self.random_state = random_state
        self.shuffle = shuffle

        self._tr_idx = None  # Training set indices
        self._ts_idx = None  # Test set indices

    @property
    def tr_idx(self):
        """Training set indices obtained with the split of the data."""
        return self._tr_idx

    @property
    def ts_idx(self):
        """Test set indices obtained with the split of the data."""
        return self._ts_idx

    def compute_indices(self, dataset):
        """Compute training set and test set indices.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        tr_idx, ts_idx : CArray
            Flat arrays with the tr/ts indices.

        """
        if not hasattr(dataset.header, 'timestamp') or \
                not hasattr(dataset.header, 'timestamp_fmt'):
            raise AttributeError("dataset must contain `timestamp` and "
                                 "`timestamp_fmt` information")

        timestamps = dataset.header.timestamp
        fmt = dataset.header.timestamp_fmt

        # Pick the samples having `timestamp <= th` to build the training set
        tr_mask = CArray(list(map(
            lambda tstmp: datetime.strptime(tstmp, fmt) <= self.th_timestamp,
            timestamps)))

        # Test set samples are all the other samples
        ts_mask = tr_mask.logical_not()

        # Compute the maximum number of train/test samples
        max_tr = tr_mask.sum()
        max_ts = dataset.num_samples - max_tr

        if max_tr == 0:
            raise ValueError("no samples with timestamp <= {:}. "
                             "Cannot split dataset.".format(self.th_timestamp))
        if max_ts == 0:
            raise ValueError("no samples with timestamp > {:}. "
                             "Cannot split dataset.".format(self.th_timestamp))

        # Compute the actual number of desired train/test samples
        if is_int(self.train_size):
            if self.train_size < 1 or self.train_size > max_tr:
                raise ValueError(
                    "train_size should be between 1 and {:}".format(max_tr))
            else:  # train_size is a valid integer, use it directly
                tr_size = self.train_size
        else:  # Compute the proportion of train samples (at least 1)
            tr_size = int(max(1, round(max_tr * self.train_size)))

        if is_int(self.test_size):
            if self.test_size < 1 or self.test_size > max_ts:
                raise ValueError(
                    "test_size should be between 1 and {:}".format(max_ts))
            else:  # test_size is a valid integer, use it directly
                ts_size = self.test_size
        else:  # Compute the proportion of test samples (at least 1)
            ts_size = int(max(1, round(max_ts * self.test_size)))

        # Get the indices of samples from boolean masks
        tr_idx = CArray(tr_mask.find(tr_mask))
        ts_idx = CArray(ts_mask.find(ts_mask))

        # Get the subset of indices to include in train/test set
        # If shuffle is True, randomize the indices
        if self.shuffle is True:
            tr_idx = CArray.randsample(
                tr_idx, shape=(tr_size, ), random_state=self.random_state)
            ts_idx = CArray.randsample(
                ts_idx, shape=(ts_size, ), random_state=self.random_state)
        else:  # Just slice the arrays of indices
            tr_idx = tr_idx[:tr_size]
            ts_idx = ts_idx[:ts_size]

        self._tr_idx = tr_idx
        self._ts_idx = ts_idx

        return self.tr_idx, self.ts_idx

    def split(self, dataset):
        """Split dataset into training set and test set.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split.

        Returns
        -------
        ds_train, ds_test : CDataset
            Train and Test datasets.

        """
        if not hasattr(dataset.header, 'timestamp') or \
                not hasattr(dataset.header, 'timestamp_fmt'):
            raise AttributeError("dataset must contain `timestamp` and "
                                 "`timestamp_fmt` information")

        # Compute the splitting indices
        tr_idx, ts_idx = self.compute_indices(dataset)

        return dataset[tr_idx, :], dataset[ts_idx, :]
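
Usage sketch (not part of the module above): the snippet below shows how the splitter is typically driven, assuming a toy CDataset whose header carries per-sample `timestamp` strings and a `timestamp_fmt` format. The feature values, labels, and timestamps are made up for illustration, and the package-level imports of CDatasetHeader and CChronologicalSplitter are assumed to follow the package layout shown in the module path.

# Illustrative sketch, not part of the library source.
from secml.array import CArray
from secml.data import CDataset, CDatasetHeader
from secml.data.splitter import CChronologicalSplitter

# Hypothetical toy dataset: 4 samples, 2 features, one timestamp per sample
header = CDatasetHeader(
    timestamp=CArray(['2015-03-01', '2015-11-20', '2017-01-05', '2017-06-30']),
    timestamp_fmt='%Y-%m-%d')
ds = CDataset([[1, 2], [3, 4], [5, 6], [7, 8]], [0, 1, 0, 1], header=header)

# '2016' parses to 2016-01-01, so the two 2015 samples form the training set
# and the two 2017 samples form the test set
splitter = CChronologicalSplitter(
    th_timestamp='2016', train_size=1.0, test_size=1.0,
    random_state=0, shuffle=True)

ds_tr, ds_ts = splitter.split(ds)
print(splitter.tr_idx, splitter.ts_idx)  # indices selected for each subset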