
"""
.. module:: CDataLoaderSklearn
   :synopsis: Collection of dataset loaders from sklearn library.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from multiprocessing import Lock

from abc import ABCMeta, abstractmethod

from secml.data.loader import CDataLoader
from secml.data import CDataset
from secml.array import CArray

__all__ = ['CDLRandom', 'CDLRandomRegression',
           'CDLRandomBlobs', 'CDLRandomBlobsRegression',
           'CDLRandomCircles', 'CDLRandomCircleRegression',
           'CDLRandomMoons', 'CDLRandomBinary',
           'CDLIris', 'CDLDigits', 'CDLBoston', 'CDLDiabetes']


class CDLRandom(CDataLoader):
    """Class for loading random data.

    Generate a random n-class classification problem.

    This initially creates clusters of points normally distributed (std=1)
    about vertices of a ``2 * class_sep``-sided hypercube, and assigns an
    equal number of clusters to each class. It introduces interdependence
    between these features and adds various types of further noise to the
    data.

    Prior to shuffling, X stacks a number of these primary "informative"
    features, "redundant" linear combinations of these, "repeated"
    duplicates of sampled features, and arbitrary noise for any remaining
    features.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=20)
        The total number of features. These comprise n_informative
        informative features, n_redundant redundant features, n_repeated
        duplicated features and
        ``n_features - n_informative - n_redundant - n_repeated``
        useless features drawn at random.
    n_informative : int, optional (default=2)
        The number of informative features. Each class is composed of a
        number of gaussian clusters each located around the vertices of a
        hypercube in a subspace of dimension n_informative. For each
        cluster, informative features are drawn independently from N(0, 1)
        and then randomly linearly combined within each cluster in order
        to add covariance. The clusters are then placed on the vertices
        of the hypercube.
    n_redundant : int, optional (default=2)
        The number of redundant features. These features are generated as
        random linear combinations of the informative features.
    n_repeated : int, optional (default=0)
        The number of duplicated features, drawn randomly from the
        informative and the redundant features.
    n_classes : int, optional (default=2)
        The number of classes (or labels) of the classification problem.
    n_clusters_per_class : int, optional (default=2)
        The number of clusters per class.
    weights : list of floats or None, optional (default=None)
        The proportions of samples assigned to each class.
        If None, then classes are balanced. Note that if
        ``len(weights) == n_classes - 1``, then the last class weight is
        automatically inferred. More than n_samples samples may be
        returned if the sum of weights exceeds 1.
    flip_y : float, optional (default=0.01)
        The fraction of samples whose class is randomly exchanged.
    class_sep : float, optional (default=1.0)
        The factor multiplying the hypercube dimension.
    hypercube : bool, optional (default=True)
        If True, the clusters are put on the vertices of a hypercube.
        If False, the clusters are put on the vertices of a random
        polytope.
    shift : float, array of shape [n_features] or None, optional (default=0.0)
        Shift features by the specified value. If None, then features
        are shifted by a random value drawn in [-class_sep, class_sep].
    scale : float, array of shape [n_features] or None, optional (default=1.0)
        Multiply features by the specified value. If None, then features
        are scaled by a random value drawn in [1, 100]. Note that scaling
        happens after shifting.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'classification'

    """
    __class_type = 'classification'

    def __init__(self, n_samples=100, n_features=20,
                 n_informative=2, n_redundant=2, n_repeated=0,
                 n_classes=2, n_clusters_per_class=2, weights=None,
                 flip_y=0.01, class_sep=1.0, hypercube=True,
                 shift=0.0, scale=1.0, random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_informative = n_informative
        self.n_redundant = n_redundant
        self.n_repeated = n_repeated
        self.n_classes = n_classes
        self.n_clusters_per_class = n_clusters_per_class
        self.weights = weights
        self.flip_y = flip_y
        self.class_sep = class_sep
        self.hypercube = hypercube
        self.shift = shift
        self.scale = scale
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_classification
        patterns, labels = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_informative=self.n_informative,
            n_redundant=self.n_redundant,
            n_repeated=self.n_repeated,
            n_classes=self.n_classes,
            n_clusters_per_class=self.n_clusters_per_class,
            weights=self.weights,
            flip_y=self.flip_y,
            class_sep=self.class_sep,
            hypercube=self.hypercube,
            shift=self.shift,
            scale=self.scale,
            random_state=self.random_state)
        return CDataset(patterns, labels)
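

# Illustrative usage sketch (not part of the original module): generating a
# small 3-class dataset with CDLRandom. Parameter values are arbitrary and the
# helper name `_example_random_classification` is hypothetical.
def _example_random_classification():
    ds = CDLRandom(n_samples=150, n_features=4, n_informative=3,
                   n_redundant=0, n_classes=3, random_state=0).load()
    # ds is a CDataset: ds.X holds the patterns, ds.Y the class labels
    return ds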


class CDLRandomRegression(CDataLoader):
    """Generate a random regression problem.

    The input set can either be well conditioned (by default)
    or have a low rank-fat tail singular profile.

    The output is generated by applying a (potentially biased) random
    linear regression model with `n_informative` nonzero regressors to
    the previously generated input, plus gaussian centered noise with
    adjustable scale.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features
        used to build the linear model used to generate the output.
    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y
        output vector associated with a sample. By default, the output
        is a scalar.
    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.
    effective_rank : int or None, optional (default=None)
        If not None, the approximate number of singular vectors required
        to explain most of the input data by linear combinations. Using
        this kind of singular spectrum in the input allows the generator
        to reproduce the correlations often observed in practice.
        If None, the input set is well conditioned, centered and gaussian
        with unit variance.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular
        values profile if `effective_rank` is not None.
    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the
        output.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'regression'

    """
    __class_type = 'regression'

    def __init__(self, n_samples=100, n_features=100,
                 n_informative=10, n_targets=1, bias=0.0,
                 effective_rank=None, tail_strength=0.5,
                 noise=0.0, random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_informative = n_informative
        self.n_targets = n_targets
        self.bias = bias
        self.effective_rank = effective_rank
        self.tail_strength = tail_strength
        self.noise = noise
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_regression
        patterns, labels = make_regression(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_informative=self.n_informative,
            n_targets=self.n_targets,
            bias=self.bias,
            effective_rank=self.effective_rank,
            tail_strength=self.tail_strength,
            noise=self.noise,
            random_state=self.random_state)
        return CDataset(patterns, labels)
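

# Illustrative usage sketch (not part of the original module): a small
# regression problem with noisy targets. Values are arbitrary examples and
# the helper name `_example_random_regression` is hypothetical.
def _example_random_regression():
    ds = CDLRandomRegression(n_samples=100, n_features=10, n_informative=5,
                             noise=0.1, random_state=0).load()
    # ds.X has shape (100, 10); ds.Y holds the real-valued targets
    return ds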


class CDLRandomBlobs(CDataLoader):
    """Generate isotropic Gaussian blobs for clustering.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points equally divided among clusters.
    n_features : int, optional (default=2)
        The number of features for each sample. This parameter is not
        considered if `centers` is given as fixed center locations.
    centers : int or array of shape [n_centers, n_features], optional (default=3)
        The number of centers to generate, or the fixed center locations
        as a list of tuples.
    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.
    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'blobs'

    """
    __class_type = 'blobs'

    def __init__(self, n_samples=100, n_features=2, centers=3,
                 cluster_std=1.0, center_box=(-10.0, 10.0),
                 random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.cluster_std = cluster_std
        self.centers = centers
        self.center_box = center_box
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs
        patterns, labels = make_blobs(
            n_samples=self.n_samples,
            n_features=self.n_features,
            centers=self.centers,
            cluster_std=self.cluster_std,
            center_box=self.center_box,
            random_state=self.random_state)
        return CDataset(patterns, labels)
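

# Illustrative usage sketch (not part of the original module): three 2D blobs
# with fixed centers. Center coordinates are arbitrary examples and the helper
# name `_example_blobs` is hypothetical.
def _example_blobs():
    ds = CDLRandomBlobs(n_samples=90,
                        centers=[(0, 0), (5, 5), (-5, 5)],
                        cluster_std=0.8, random_state=0).load()
    # ds.Y contains one label per blob (0, 1, 2), 30 samples each
    return ds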


class CDLRandomBlobsRegression(CDataLoader):
    """This class loads a blobs regression problem.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points equally divided among clusters.
    cluster_std : list of floats, optional (default=(1.0, 1.0))
        The standard deviation of the clusters.
    bias : float, optional (default=1.0)
        Bias added to the regression function.
    w : list of floats, optional (default=(2.0, -1.0))
        The height (weight) of each gaussian used to build the
        regression function.
    centers : list of tuples, optional (default=([0, 0], [-1, -1]))
        The center of each gaussian cluster.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'blobs-regression'

    """
    __class_type = 'blobs-regression'

    def __init__(self, n_samples=100, cluster_std=(1.0, 1.0),
                 bias=1.0, w=(2.0, -1.0), centers=([0, 0], [-1, -1]),
                 random_state=None):
        self.n_samples = n_samples
        self.bias = bias
        self.w = w
        self.centers = centers
        self.cluster_std = cluster_std
        self.random_state = random_state

    def _dts_function(self, X):
        """Compute the regression targets as the bias plus a weighted
        sum of gaussian pdfs, one centered on each cluster."""
        from secml.ml.stats import CDistributionGaussian
        d = X.shape[1]  # number of features
        Y = self.bias
        for gauss_idx in range(len(self.centers)):
            Y += self.w[gauss_idx] * \
                CDistributionGaussian(
                    mean=self.centers[gauss_idx],
                    cov=self.cluster_std[gauss_idx] * CArray.eye(d, d)).pdf(X)
        return Y

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs
        patterns = make_blobs(
            n_samples=self.n_samples, n_features=2,
            centers=self.centers, cluster_std=self.cluster_std,
            random_state=self.random_state)[0]
        return CDataset(patterns, self._dts_function(CArray(patterns)))
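

# Note (illustrative, not part of the original module): the regression target
# produced by `_dts_function` is
#     y(x) = bias + sum_k w[k] * N(x; mean=centers[k], cov=cluster_std[k] * I),
# i.e. a weighted mixture of gaussian densities evaluated at each pattern.
# The sketch below regenerates the default two-blob dataset; the helper name
# `_example_blobs_regression` is hypothetical.
def _example_blobs_regression():
    ds = CDLRandomBlobsRegression(n_samples=100, random_state=0).load()
    # ds.Y is real-valued: larger near the positively weighted blob at (0, 0),
    # smaller near the negatively weighted blob at (-1, -1)
    return ds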


class CDLRandomCircles(CDataLoader):
    """Make a large circle containing a smaller circle in 2D.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    factor : double < 1, optional (default=0.8)
        Scale factor between inner and outer circle.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'circles'

    """
    __class_type = 'circles'

    def __init__(self, n_samples=100, noise=None,
                 factor=0.8, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.factor = factor
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_circles
        patterns, labels = make_circles(
            n_samples=self.n_samples, noise=self.noise,
            factor=self.factor, random_state=self.random_state)
        return CDataset(patterns, labels)


class CDLRandomCircleRegression(CDataLoader):
    """Make a large circle containing a smaller circle in 2D,
    with a regression target.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    factor : double < 1, optional (default=0.8)
        Scale factor between inner and outer circle.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'circles-regression'

    """
    __class_type = 'circles-regression'

    def __init__(self, n_samples=100, noise=None,
                 factor=0.8, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.factor = factor
        self.random_state = random_state

    def _dts_function(self, X):
        """Compute the regression target as the squared distance
        of each pattern from the origin."""
        return X[:, 0] ** 2 + X[:, 1] ** 2

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_circles
        patterns = make_circles(
            n_samples=self.n_samples, noise=self.noise,
            factor=self.factor, random_state=self.random_state)[0]
        return CDataset(patterns, self._dts_function(patterns))
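

# Note (illustrative, not part of the original module): `_dts_function` maps
# each 2D point to its squared radius, y = x0**2 + x1**2, so the two circles
# produced by make_circles get targets around factor**2 (inner) and 1 (outer),
# plus noise. The helper name `_example_circle_regression` is hypothetical.
def _example_circle_regression():
    ds = CDLRandomCircleRegression(n_samples=100, noise=0.05,
                                   factor=0.5, random_state=0).load()
    return ds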


class CDLRandomMoons(CDataLoader):
    """Make two interleaving half circles.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'moons'

    """
    __class_type = 'moons'

    def __init__(self, n_samples=100, noise=None, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_moons
        patterns, labels = make_moons(
            n_samples=self.n_samples, noise=self.noise,
            random_state=self.random_state)
        return CDataset(patterns, labels)


class CDLRandomBinary(CDataLoader):
    """Generate random binary data.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    n_features : int, optional (default=2)
        The total number of features.

    Attributes
    ----------
    class_type : 'binary'

    """
    __class_type = 'binary'

    def __init__(self, n_samples=100, n_features=2):
        self.n_samples = n_samples
        self.n_features = n_features

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        patterns = CArray.randint(
            2, shape=(self.n_samples, self.n_features))
        labels = CArray.randint(2, shape=(1, self.n_samples))
        return CDataset(patterns, labels)
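

# Illustrative usage sketch (not part of the original module): both patterns
# and labels are drawn uniformly from {0, 1}, so the features carry no signal
# about the labels. The helper name `_example_binary` is hypothetical.
def _example_binary():
    ds = CDLRandomBinary(n_samples=10, n_features=4).load()
    # ds.X is a 10x4 array of 0/1 values; ds.Y holds 10 random 0/1 labels
    return ds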


class CDLRandomToy(CDataLoader, metaclass=ABCMeta):
    """Loads a random toy dataset (abstract interface).

    Available toy datasets:
     - iris (classification) -> `CDLIris`
     - digits (classification) -> `CDLDigits`
     - boston (regression) -> `CDLBoston`
     - diabetes (regression) -> `CDLDiabetes`

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    """
    __lock = Lock()  # Lock to prevent multiple parallel download/extraction

    def __init__(self, class_list=None, zero_one=False):
        self.class_list = class_list
        self.zero_one = zero_one

    @property
    @abstractmethod
    def toy(self):
        """Identifier of the toy dataset."""
        raise NotImplementedError

    def _select_classes(self, class_list, patterns, labels):
        sel_patterns = None
        sel_labels = None

        for single_class in class_list:
            this_class_pat_idx = labels.find(labels == single_class)
            if sel_patterns is None:
                sel_patterns = patterns[this_class_pat_idx, :]
                sel_labels = labels[this_class_pat_idx]
            else:
                sel_patterns = sel_patterns.append(
                    patterns[this_class_pat_idx, :], axis=0)
                sel_labels = sel_labels.append(
                    labels[this_class_pat_idx])

        if self.zero_one is True:
            if len(class_list) > 2:
                raise ValueError("cannot convert labels to 0/1 for a "
                                 "dataset with more than 2 classes")
            else:
                class_list.sort()
                sel_labels[sel_labels == class_list[0]] = 0
                sel_labels[sel_labels == class_list[1]] = 1

        return CDataset(sel_patterns, sel_labels)

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The loaded dataset.

        """
        with CDLRandomToy.__lock:
            if self.toy == 'iris':
                from sklearn.datasets import load_iris
                toy_data = load_iris()
            elif self.toy == 'digits':
                from sklearn.datasets import load_digits
                toy_data = load_digits()
            elif self.toy == 'boston':
                from sklearn.datasets import load_boston
                toy_data = load_boston()
            elif self.toy == 'diabetes':
                from sklearn.datasets import load_diabetes
                toy_data = load_diabetes()
            else:
                raise ValueError(
                    "toy dataset {:} is not available.".format(self.toy))

        # Returning a CDataset
        if self.class_list is None:
            return CDataset(CArray(toy_data.data), CArray(toy_data.target))
        else:
            return self._select_classes(self.class_list,
                                        CArray(toy_data.data),
                                        CArray(toy_data.target))


class CDLIris(CDLRandomToy):
    """Loads Iris dataset.

    The iris dataset is a classic and very easy
    multi-class classification dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'iris'

    """
    __class_type = 'iris'

    toy = 'iris'
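

# Illustrative usage sketch (not part of the original module): loading Iris
# and keeping only two classes with 0/1 labels. Based on `_select_classes`
# above, the `class_list` entries are compared against the numeric sklearn
# targets (0, 1, 2 for Iris); the helper name `_example_iris_binary` is
# hypothetical.
def _example_iris_binary():
    ds = CDLIris(class_list=[0, 2], zero_one=True).load()
    # 100 samples (50 per class); labels remapped to 0 and 1
    return ds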


class CDLDigits(CDLRandomToy):
    """Loads Digits dataset.

    The digits dataset is a classic and very easy
    multi-class classification dataset. Each datapoint
    is a 8x8 image of a digit.

    =================   ==============
    Classes                         10
    Samples per class             ~180
    Samples total                 1797
    Dimensionality                  64
    Features             integers 0-16
    =================   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'digits'

    """
    __class_type = 'digits'

    toy = 'digits'


class CDLBoston(CDLRandomToy):
    """Loads Boston dataset.

    Boston house-prices dataset, useful for regression.

    ==============   ==============
    Samples total               506
    Dimensionality               13
    Features         real, positive
    Targets           real 5. - 50.
    ==============   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'boston'

    """
    __class_type = 'boston'

    toy = 'boston'


class CDLDiabetes(CDLRandomToy):
    """Loads Diabetes dataset.

    Diabetes dataset, useful for regression.

    ==============   ==================
    Samples total                   442
    Dimensionality                   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    ==============   ==================

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'diabetes'

    """
    __class_type = 'diabetes'

    toy = 'diabetes'
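

# Illustrative usage sketch (not part of the original module): the regression
# toys are typically loaded without class selection, since their targets are
# continuous. The helper name `_example_diabetes` is hypothetical.
def _example_diabetes():
    ds = CDLDiabetes().load()
    # ds.X has shape (442, 10); ds.Y holds the continuous targets
    return ds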