Source code for secml.data.c_dataset_header

"""
.. module:: CDatasetHeader
   :synopsis: Header with extra dataset attributes.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from secml.core import CCreator
from secml.core.attr_utils import is_writable
from secml.core.type_utils import is_list
from secml.array import CArray


class CDatasetHeader(CCreator):
    """Container of extra attributes attached to a dataset.

    Header parameters can be supplied as keyword arguments at
    construction time, or added later by assigning new public
    attributes on the header.

    When the header is indexed, immutable objects (scalar, string,
    tuple, dictionary) are handed over unchanged, whereas arrays are
    indexed and the indexed result is returned.

    Use `.get_params()` to obtain a dictionary with the entire set of
    attributes.

    Parameters
    ----------
    kwargs : any, optional
        Any extra attribute of the dataset.
        Could be an immutable object (scalar, tuple, dict, str),
        or a vector-like CArray. Lists are automatically converted
        to vector-like CArrays.

    Examples
    --------
    >>> from secml.data import CDatasetHeader
    >>> from secml.array import CArray

    >>> ds_header = CDatasetHeader(id='mydataset', colors=CArray([1,2,3]))
    >>> print(ds_header.id)
    mydataset
    >>> print(ds_header.colors)
    CArray([1 2 3])

    >>> ds_header.age = 32
    >>> print(ds_header.age)
    32

    """
    __super__ = 'CDatasetHeader'
    __class_type = 'standard'

    def __init__(self, **kwargs):

        # Filled in by `._validate_params()` once an array is stored
        self._num_samples = None

        # Every keyword goes through `__setattr__`, so lists are
        # converted and array sizes are checked on the way in
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @property
    def num_samples(self):
        """Size shared by the array attributes of the header.

        It is None until `._validate_params()` has recorded the size
        of an array attribute.

        """
        return self._num_samples

    def __setattr__(self, key, value):
        """Store an attribute on the header.

        Lists are wrapped into a CArray and every CArray is flattened
        through `.ravel()` before being stored. If `is_writable`
        reports the attribute as writable, the header parameters are
        validated right after the assignment.

        Parameters
        ----------
        key : str
            Attribute to set.
        value : any
            Value to assign to the attribute.
            Could be an immutable object (scalar, tuple, dict, str),
            or a vector-like CArray. Lists are automatically converted
            to vector-like CArrays.

        """
        # Lists become CArrays so that they can be indexed later
        if is_list(value):
            value = CArray(value)

        # Arrays are always kept in vector-like form
        if isinstance(value, CArray):
            value = value.ravel()

        super(CDatasetHeader, self).__setattr__(key, value)

        # Only writable attributes trigger the consistency check
        if is_writable(self, key):
            self._validate_params()

    def _validate_params(self):
        """Check that the header parameters are consistent.

        Each parameter which is a CArray must have the size stored in
        `num_samples`. When `num_samples` is still None, the size of
        the array is recorded instead of being checked.

        Raises
        ------
        ValueError
            If an array has a size different from `num_samples`.
            The offending attribute is deleted before raising.

        """
        for name, value in self.get_params().items():

            if not isinstance(value, CArray):
                continue  # Only arrays are subject to the size check

            expected = self.num_samples
            if expected is not None and value.size != expected:
                delattr(self, name)  # Drop the inconsistent attribute
                raise ValueError(
                    "`{:}` is an array of size {:}. "
                    "{:} expected.".format(name, value.size, expected))

            # Record the size in the protected `_num_samples` attribute
            self._num_samples = value.size

    def __getitem__(self, idx):
        """Build a new header restricted to the given index.

        Parameters which are CArrays are indexed with `idx`; any other
        parameter (scalar, string, tuple, dictionary, ...) is copied
        over to the new header as it is.

        Parameters
        ----------
        idx : any
            Index forwarded unchanged to each CArray parameter.

        Returns
        -------
        CDatasetHeader
            A new instance of the same class as the current header.

        Examples
        --------
        >>> from secml.data import CDatasetHeader
        >>> from secml.array import CArray

        >>> ds_header = CDatasetHeader(id='mydataset', age=CArray([1,2,3]))

        >>> h_subset = ds_header[[0, 2]]
        >>> h_subset.id
        'mydataset'
        >>> h_subset.age
        CArray(2,)(dense: [1 3])

        """
        selected = {}
        for name in self.get_params():
            value = getattr(self, name)
            if isinstance(value, CArray):
                value = value[idx]
            # Non-array values fall through untouched
            selected[name] = value
        return self.__class__(**selected)

    def __str__(self):
        """Return the class name followed by the parameter names."""
        params = self.get_params()
        cls_name = self.__class__.__name__
        if len(params) > 0:
            return cls_name + "{'" + "', '".join(params) + "'}"
        return cls_name + "{}"

    def append(self, header):
        """Merge the input header with the current one.

        Parameters
        ----------
        header : CDatasetHeader
            Header to append. Only attributes which are arrays are
            merged. Other attributes are set if not already defined
            in the current header. Otherwise, the value of the
            attributes in the input header should be equal to the
            value of the same attribute in the current header.

        Returns
        -------
        CDatasetHeader
            A new instance of the same class as the current header.

        Raises
        ------
        ValueError
            If a non-array attribute exists in both headers with
            different values.

        Notes
        -----
        Append does not occur in-place: a new header is allocated
        and filled.

        See Also
        --------
        CArray.append : More information about arrays append.

        Examples
        --------
        >>> from secml.data import CDatasetHeader
        >>> from secml.array import CArray

        >>> ds_header1 = CDatasetHeader(id={'a': 0, 'b': 2}, a=2, age=CArray([1,2,3]))
        >>> ds_header2 = CDatasetHeader(id={'a': 0, 'b': 2}, b=4, age=CArray([1,2,3]))

        >>> ds_merged = ds_header1.append(ds_header2)
        >>> ds_merged.age
        CArray(6,)(dense: [1 2 3 1 2 3])
        >>> ds_merged.id  # doctest: +SKIP
        {'a': 0, 'b': 2}
        >>> ds_merged.a
        2
        >>> ds_merged.b
        4

        """
        merged = {}

        for name in header.get_params():

            if not hasattr(self, name):
                # Attribute only known to the input header
                merged[name] = getattr(header, name)
                continue

            current = getattr(self, name)
            if isinstance(current, CArray):
                # Arrays of the two headers are concatenated
                merged[name] = current.append(getattr(header, name))
            elif current != getattr(header, name):
                # Anything which is not an array must match exactly
                raise ValueError(
                    "value of '{:}' in input header should be equal "
                    "to '{:}'".format(name, current))

        # Carry over what the loop above did not store: attributes
        # missing from the input header and matching non-array ones
        for name in self.get_params():
            if name in merged:
                continue
            merged[name] = getattr(self, name)

        return self.__class__(**merged)