Source code for secml.data.c_dataset

"""
.. module:: CDataset
   :synopsis: A dataset with an array of patterns and corresponding labels

.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from secml.core import CCreator
from secml.array import CArray
from secml.data import CDatasetHeader
from secml.data.data_utils import label_binarize_onehot


class CDataset(CCreator):
    """Creates a new dataset.

    A dataset consists of a 2-Dimensional patterns array,
    dense or sparse format, with one pattern for each row
    and a flat dense array with each pattern's label.

    Parameters
    ----------
    x : `array_like` or CArray
        Dataset patterns, one for each row.
        Array is converted to 2-Dimensions before storing.
    y : `array_like` or CArray
        Dataset labels. Array is converted to dense format
        and flattened before storing.
    header : CDatasetHeader or None, optional
        The header for the dataset. Will define any extra parameter.
        See `CDatasetHeader` docs for more information.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of samples, or
        if the header declares a number of samples different from the
        number of patterns.
    TypeError
        If `header` is neither None nor a CDatasetHeader instance.

    Examples
    --------
    >>> from secml.data import CDataset

    >>> ds = CDataset([[1,2],[3,4],[5,6]],[1,0,1])
    >>> print(ds.X)
    CArray([[1 2]
     [3 4]
     [5 6]])
    >>> print(ds.Y)
    CArray([1 0 1])

    >>> ds = CDataset([1,2,3],1)  # Patterns will be converted to 2-Dims
    >>> print(ds.X)
    CArray([[1 2 3]])
    >>> print(ds.Y)
    CArray([1])

    >>> from secml.array import CArray
    >>> ds = CDataset(CArray([[1,0],[0,4],[1,0]],tosparse=True), CArray([1,0,1],tosparse=True))
    >>> print(ds.X)  # doctest: +NORMALIZE_WHITESPACE
    CArray(  (0, 0)  1
      (1, 1)  4
      (2, 0)  1)
    >>> print(ds.Y)
    CArray([1 0 1])

    The number of labels must be equal to the number of samples

    >>> ds = CDataset([[1,2],[3,4]],1)
    Traceback (most recent call last):
      ...
    ValueError: number of labels (1) must be equal to the number of samples (2).

    >>> from secml.data import CDatasetHeader
    >>> ds = CDataset([1,2,3], 1, CDatasetHeader(id='mydataset', age=34))  # 2 extra attributes
    >>> print(ds.header.id)
    mydataset
    >>> print(ds.header.age)
    34

    """
    __super__ = 'CDataset'
    __class_type = 'standard'

    def __init__(self, x, y, header=None):

        # Default placeholders (keep to make _check_samples_labels work):
        # each setter only runs the size check once the *other* attribute
        # has already been stored, so both must exist beforehand.
        self._X = None
        self._Y = None

        # Patterns are forced to be 2-D (one row for each pattern)
        self.X = x

        # Labels are 1-D (one label for each sample)
        # This will also check patterns/labels size consistency
        self.Y = y

        # Header that will store extra attributes of dataset.
        # Set last: its setter compares against self.num_samples.
        self.header = header

    def __setstate__(self, state):
        """Reset CDataset instance after unpickling.

        Parameters
        ----------
        state : dict
            Instance attributes to restore.

        """
        self.__dict__.update(state)
        # Initialize header placeholder if not available
        # Necessary to unpickle old dataset (stored with secml < v0.6)
        if not hasattr(self, '_header'):
            self._header = None

    @property
    def X(self):
        """Dataset Patterns."""
        return self._X

    @X.setter
    def X(self, value):
        """Set Dataset Patterns.

        Parameters
        ----------
        value : `array_like` or CArray
            Array containing patterns. Data is converted to
            2-Dimensions before storing.

        Raises
        ------
        ValueError
            If labels are already stored and their number differs from
            the number of rows of `value`.

        """
        x = CArray(value).atleast_2d()
        if self.Y is not None:  # Checking number of samples/labels equality
            self._check_samples_labels(x=x)
        self._X = x

    @property
    def Y(self):
        """Dataset Labels."""
        return self._Y

    @Y.setter
    def Y(self, value):
        """Set Dataset Labels.

        Parameters
        ----------
        value : `array_like` or CArray
            Array containing labels. Array is converted to
            dense format and flattened before storing.

        Raises
        ------
        ValueError
            If patterns are already stored and their number differs
            from the number of labels in `value`.

        """
        y = CArray(value).todense().ravel()
        if self._X is not None:  # Checking number of samples/labels equality
            self._check_samples_labels(y=y)
        self._Y = y

    @property
    def header(self):
        """Dataset header."""
        return self._header

    @header.setter
    def header(self, value):
        """Set Dataset header.

        Parameters
        ----------
        value : CDatasetHeader or None
            Header to store. None removes the header.

        Raises
        ------
        TypeError
            If `value` is neither None nor a CDatasetHeader instance.
        ValueError
            If the header declares a number of samples that differs
            from the number of samples of the dataset.

        """
        if value is not None:
            if not isinstance(value, CDatasetHeader):
                raise TypeError(
                    "'header' must be an instance of 'CDatasetHeader'")
            # Check if header is compatible (same num_samples)
            if value.num_samples is not None and \
                    self.num_samples != value.num_samples:
                # Report the header's size first, then the size the
                # dataset requires (the arguments used to be swapped).
                raise ValueError(
                    "incompatible header size {:}. {:} expected.".format(
                        value.num_samples, self.num_samples))
        self._header = value

    @property
    def num_samples(self):
        """Number of patterns."""
        return self.X.shape[0]

    @property
    def num_features(self):
        """Number of features."""
        return self.X.shape[1]

    @property
    def num_labels(self):
        """Returns dataset's number of labels."""
        return self.Y.size

    @property
    def classes(self):
        """Classes (unique)."""
        return self.Y.unique()

    @property
    def num_classes(self):
        """Number of classes."""
        return self.classes.size

    @property
    def issparse(self):
        """True if patterns are stored in sparse format, else False."""
        return self.X.issparse

    @property
    def isdense(self):
        """True if patterns are stored in dense format, else False."""
        return self.X.isdense

    def _check_samples_labels(self, x=None, y=None):
        """Raise ValueError if the number of labels is different
        from the number of samples.

        Parameters
        ----------
        x : CArray or None, optional
            Patterns to check. If None, the stored patterns are used.
        y : CArray or None, optional
            Labels to check. If None, the stored labels are used.

        Raises
        ------
        ValueError
            If `y.size` differs from the number of rows of `x`.

        """
        x = self.X if x is None else x
        y = self.Y if y is None else y
        if x.shape[0] != y.size:
            raise ValueError(
                "number of labels ({:}) must be equal to the number "
                "of samples ({:}).".format(y.size, x.shape[0]))

    def __getitem__(self, idx):
        """Given an index, get the corresponding X and Y elements.

        Parameters
        ----------
        idx : tuple
            One index for each dimension of the patterns array.
            The first element selects the samples and is also used
            to index the labels and the header (if any).

        Returns
        -------
        CDataset
            A new dataset built from the selected patterns, labels
            and header.

        Raises
        ------
        IndexError
            If `idx` is not a tuple with one element per patterns'
            dimension.

        """
        if not isinstance(idx, tuple) or len(idx) != self.X.ndim:
            raise IndexError(
                "{:} sequences are required for indexing.".format(self.X.ndim))

        # idx is guaranteed to be a tuple here: its first element is
        # the samples (rows) selector, shared by labels and header.
        samples_idx = idx[0]

        y = self.Y[samples_idx]

        header = None
        if self.header is not None:
            header = self.header[samples_idx]

        return self.__class__(self.X[idx], y, header=header)

    def __setitem__(self, idx, data):
        """Given an index, set the corresponding X and Y elements.

        Parameters
        ----------
        idx : tuple
            One index for each dimension of the patterns array.
            The first element selects the labels to be set.
        data : CDataset
            Dataset providing the patterns and labels to store.

        Raises
        ------
        TypeError
            If `data` is not an instance of this dataset's class.
        IndexError
            If `idx` is not a tuple with one element per patterns'
            dimension.

        """
        if not isinstance(data, self.__class__):
            raise TypeError("dataset can be set only using another dataset.")
        if not isinstance(idx, tuple) or len(idx) != self.X.ndim:
            raise IndexError(
                "{:} sequences are required for indexing.".format(self.X.ndim))
        self.X[idx] = data.X
        # We now set the labels corresponding to set patterns
        self.Y[idx[0]] = data.Y

    def append(self, dataset):
        """Append input dataset to current dataset.

        Parameters
        ----------
        dataset : CDataset
            Dataset to append. Patterns are appended on first
            axis (axis=0) so the number of features must be
            equal to dataset.num_features. If current dataset
            format is sparse, dense dataset to append will
            be converted to sparse and vice-versa.

        Returns
        -------
        CDataset
            A new Dataset resulting of appending new data to
            existing data. Format of resulting dataset is equal
            to current dataset format.

        Raises
        ------
        ValueError
            If only one of the two datasets has a header and that
            header defines a number of samples.

        Notes
        -----
        Append does not occur in-place: a new dataset is allocated
        and filled.

        See Also
        --------
        :meth:`CArray.append` : More information about arrays append.

        Examples
        --------
        >>> from secml.data import CDataset

        >>> ds = CDataset([[1,2],[3,4],[5,6]],[1,0,1])
        >>> ds_new = ds.append(CDataset([[10,20],[30,40],[50,60]],[1,0,1]))
        >>> print(ds_new.X)
        CArray([[ 1  2]
         [ 3  4]
         [ 5  6]
         [10 20]
         [30 40]
         [50 60]])
        >>> print(ds_new.Y)
        CArray([1 0 1 1 0 1])

        >>> ds_new = ds.append(CDataset([[10,20],[30,40],[50,60]],[1,0,1]).tosparse())
        >>> print(ds_new.X)
        CArray([[ 1  2]
         [ 3  4]
         [ 5  6]
         [10 20]
         [30 40]
         [50 60]])
        >>> print(ds_new.Y)
        CArray([1 0 1 1 0 1])

        """
        # Format conversion and error checking is managed by CArray.append()
        new_labels = self.Y.append(dataset.Y)

        # As the input dataset (or self) could have no header, check it
        if dataset.header is None or self.header is None:
            # Keep whichever header exists (None if both are missing).
            # Explicit None test: the choice must not depend on the
            # truth value of the header object.
            new_header = \
                self.header if self.header is not None else dataset.header
            if new_header is not None and new_header.num_samples is not None:
                # A per-sample header cannot cover the samples coming
                # from the dataset which has no header.
                raise ValueError(
                    "cannot append a dataset with header and "
                    "{:} samples as the other has no header. "
                    "Define a consistent header for both dataset "
                    "and try again.".format(new_header.num_samples))
        else:  # Both input ds and self have header, merge them
            new_header = self.header.append(dataset.header)

        return self.__class__(
            self.X.append(dataset.X, axis=0), new_labels, header=new_header)

    def tosparse(self):
        """Convert dataset's patterns to sparse format.

        Returns
        -------
        CDataset
            A new CDataset with same patterns converted
            to sparse format. Copy is avoided if possible.

        Examples
        --------
        >>> from secml.data import CDataset

        >>> ds = CDataset([[1,2],[3,4],[5,6]],[1,0,1]).tosparse()
        >>> print(ds.X)  # doctest: +NORMALIZE_WHITESPACE
        CArray(  (0, 0)  1
          (0, 1)  2
          (1, 0)  3
          (1, 1)  4
          (2, 0)  5
          (2, 1)  6)
        >>> print(ds.Y)
        CArray([1 0 1])

        """
        return self.__class__(self.X.tosparse(), self.Y, header=self.header)

    def todense(self):
        """Convert dataset's patterns to dense format.

        Returns
        -------
        CDataset
            A new CDataset with same patterns converted
            to dense format. Copy is avoided if possible.

        Examples
        --------
        >>> from secml.data import CDataset

        >>> ds = CDataset(CArray([[1,2],[3,4],[5,6]], tosparse=True),[1,0,1]).todense()
        >>> print(ds.X)
        CArray([[1 2]
         [3 4]
         [5 6]])

        """
        return self.__class__(self.X.todense(), self.Y, header=self.header)

    def get_labels_ovr(self, pos_label):
        """Return dataset labels in one-vs-rest encoding.

        Parameters
        ----------
        pos_label : scalar or str
            Label of the class to consider as positive.

        Returns
        -------
        CArray
            Flat array with 1 when the class label is equal
            to input positive class's label, else 0.

        Examples
        --------
        >>> ds = CDataset([[11,22],[33,44],[55,66],[77,88]], [1,0,2,1])
        >>> print(ds.get_labels_ovr(2))
        CArray([0 0 1 0])
        >>> print(ds.get_labels_ovr(1))
        CArray([1 0 0 1])

        """
        # Assigning 1 for each label of positive class and 0 to all others
        return CArray([1 if e == pos_label else 0 for e in self.Y])

    def get_labels_onehot(self):
        """Return dataset labels in one-hot encoding.

        Returns
        -------
        binary_labels : CArray
            A (num_samples, num_classes) array with the
            dataset labels one-hot encoded.

        Examples
        --------
        >>> ds = CDataset([[11,22],[33,44],[55,66],[77,88]], [1,0,2,1])
        >>> print(ds.get_labels_onehot())
        CArray([[0 1 0]
         [1 0 0]
         [0 0 1]
         [0 1 0]])

        """
        # Our convention is that the labels are from 0 ... N (integers only)
        # Do not use `self.classes` directly as Y can contain only a subset
        # of the known classes
        return label_binarize_onehot(self.Y)

    def get_bounds(self, offset=0.0):
        """Return dataset boundaries plus an offset.

        Parameters
        ----------
        offset : float
            Quantity to be added as an offset. Default 0.

        Returns
        -------
        boundary : list of tuple
            Every tuple contain min and max feature value plus an
            offset for corresponding coordinate.

        Examples
        --------
        >>> from secml.array import CArray
        >>> from secml.data import CDataset

        >>> ds = CDataset([[1,2,3],[4,5,6]], [1,2])
        >>> ds.get_bounds()
        [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

        """
        # The offset widens the interval on both sides
        x_min = self.X.min(axis=0) - offset
        x_max = self.X.max(axis=0) + offset
        # One (min, max) tuple of scalars for each feature
        return [(x_min[0, f_idx].item(), x_max[0, f_idx].item())
                for f_idx in range(self.num_features)]