
"""
.. module:: CDataLoaderSklearn
   :synopsis: Collection of dataset loaders from sklearn library.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from multiprocessing import Lock

from abc import ABCMeta, abstractmethod

from secml.data.loader import CDataLoader
from secml.data import CDataset
from secml.array import CArray

__all__ = ['CDLRandom', 'CDLRandomRegression',
           'CDLRandomBlobs', 'CDLRandomBlobsRegression',
           'CDLRandomCircles', 'CDLRandomCircleRegression',
           'CDLRandomMoons', 'CDLRandomBinary',
           'CDLIris', 'CDLDigits', 'CDLBoston', 'CDLDiabetes']


class CDLRandom(CDataLoader):
    """Class for loading random data.

    Generate a random n-class classification problem.

    This initially creates clusters of points normally distributed (std=1)
    about vertices of a ``2 * class_sep``-sided hypercube, and assigns an
    equal number of clusters to each class. It introduces interdependence
    between these features and adds various types of further noise to the
    data.

    Prior to shuffling, X stacks a number of these primary "informative"
    features, "redundant" linear combinations of these, "repeated"
    duplicates of sampled features, and arbitrary noise for any remaining
    features.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=20)
        The total number of features. These comprise n_informative
        informative features, n_redundant redundant features, n_repeated
        duplicated features and
        ``n_features - n_informative - n_redundant - n_repeated``
        useless features drawn at random.
    n_informative : int, optional (default=2)
        The number of informative features. Each class is composed of a
        number of gaussian clusters each located around the vertices of a
        hypercube in a subspace of dimension n_informative. For each
        cluster, informative features are drawn independently from N(0, 1)
        and then randomly linearly combined within each cluster in order
        to add covariance. The clusters are then placed on the vertices
        of the hypercube.
    n_redundant : int, optional (default=2)
        The number of redundant features. These features are generated as
        random linear combinations of the informative features.
    n_repeated : int, optional (default=0)
        The number of duplicated features, drawn randomly from the
        informative and the redundant features.
    n_classes : int, optional (default=2)
        The number of classes (or labels) of the classification problem.
    n_clusters_per_class : int, optional (default=2)
        The number of clusters per class.
    weights : list of floats or None, optional (default=None)
        The proportions of samples assigned to each class.
        If None, then classes are balanced. Note that if
        ``len(weights) == n_classes - 1``, then the last class weight is
        automatically inferred. More than n_samples samples may be
        returned if the sum of weights exceeds 1.
    flip_y : float, optional (default=0.01)
        The fraction of samples whose class is randomly exchanged.
    class_sep : float, optional (default=1.0)
        The factor multiplying the hypercube dimension.
    hypercube : bool, optional (default=True)
        If True, the clusters are put on the vertices of a hypercube.
        If False, the clusters are put on the vertices of a random
        polytope.
    shift : float, array of shape [n_features] or None, optional (default=0.0)
        Shift features by the specified value. If None, then features
        are shifted by a random value drawn in [-class_sep, class_sep].
    scale : float, array of shape [n_features] or None, optional (default=1.0)
        Multiply features by the specified value. If None, then features
        are scaled by a random value drawn in [1, 100]. Note that scaling
        happens after shifting.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'classification'

    """
    __class_type = 'classification'

    def __init__(self, n_samples=100, n_features=20,
                 n_informative=2, n_redundant=2, n_repeated=0,
                 n_classes=2, n_clusters_per_class=2, weights=None,
                 flip_y=0.01, class_sep=1.0, hypercube=True,
                 shift=0.0, scale=1.0, random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_informative = n_informative
        self.n_redundant = n_redundant
        self.n_repeated = n_repeated
        self.n_classes = n_classes
        self.n_clusters_per_class = n_clusters_per_class
        self.weights = weights
        self.flip_y = flip_y
        self.class_sep = class_sep
        self.hypercube = hypercube
        self.shift = shift
        self.scale = scale
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_classification
        patterns, labels = make_classification(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_informative=self.n_informative,
            n_redundant=self.n_redundant,
            n_repeated=self.n_repeated,
            n_classes=self.n_classes,
            n_clusters_per_class=self.n_clusters_per_class,
            weights=self.weights,
            flip_y=self.flip_y,
            class_sep=self.class_sep,
            hypercube=self.hypercube,
            shift=self.shift,
            scale=self.scale,
            random_state=self.random_state)
        return CDataset(patterns, labels)
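

# Illustrative usage sketch (not part of the original module): generating a
# small 3-class dataset with CDLRandom. Parameter values are arbitrary and the
# helper name `_example_random_classification` is hypothetical.
def _example_random_classification():
    ds = CDLRandom(n_samples=150, n_features=4, n_informative=3,
                   n_redundant=0, n_classes=3, random_state=0).load()
    # ds is a CDataset: ds.X holds the patterns, ds.Y the class labels
    return ds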


class CDLRandomRegression(CDataLoader):
    """Generate a random regression problem.

    The input set can either be well conditioned (by default)
    or have a low rank-fat tail singular profile.

    The output is generated by applying a (potentially biased) random
    linear regression model with `n_informative` nonzero regressors to
    the previously generated input, plus gaussian centered noise with
    adjustable scale.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features
        used to build the linear model used to generate the output.
    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y
        output vector associated with a sample. By default, the output
        is a scalar.
    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.
    effective_rank : int or None, optional (default=None)
        If not None, the approximate number of singular vectors required
        to explain most of the input data by linear combinations. Using
        this kind of singular spectrum in the input allows the generator
        to reproduce the correlations often observed in practice.
        If None, the input set is well conditioned, centered and gaussian
        with unit variance.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular
        values profile if `effective_rank` is not None.
    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the
        output.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'regression'

    """
    __class_type = 'regression'

    def __init__(self, n_samples=100, n_features=100,
                 n_informative=10, n_targets=1, bias=0.0,
                 effective_rank=None, tail_strength=0.5,
                 noise=0.0, random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_informative = n_informative
        self.n_targets = n_targets
        self.bias = bias
        self.effective_rank = effective_rank
        self.tail_strength = tail_strength
        self.noise = noise
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_regression
        patterns, labels = make_regression(
            n_samples=self.n_samples,
            n_features=self.n_features,
            n_informative=self.n_informative,
            n_targets=self.n_targets,
            bias=self.bias,
            effective_rank=self.effective_rank,
            tail_strength=self.tail_strength,
            noise=self.noise,
            random_state=self.random_state)
        return CDataset(patterns, labels)
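

# Illustrative usage sketch (not part of the original module): a small
# regression problem with noisy targets. Values are arbitrary examples and
# the helper name `_example_random_regression` is hypothetical.
def _example_random_regression():
    ds = CDLRandomRegression(n_samples=100, n_features=10, n_informative=5,
                             noise=0.1, random_state=0).load()
    # ds.X has shape (100, 10); ds.Y holds the real-valued targets
    return ds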


class CDLRandomBlobs(CDataLoader):
    """Generate isotropic Gaussian blobs for clustering.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points equally divided among clusters.
    n_features : int, optional (default=2)
        The number of features for each sample. This parameter is not
        considered if `centers` is given as fixed center locations.
    centers : int or array of shape [n_centers, n_features], optional (default=3)
        The number of centers to generate, or the fixed center locations
        as a list of tuples.
    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.
    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'blobs'

    """
    __class_type = 'blobs'

    def __init__(self, n_samples=100, n_features=2, centers=3,
                 cluster_std=1.0, center_box=(-10.0, 10.0),
                 random_state=None):
        self.n_samples = n_samples
        self.n_features = n_features
        self.cluster_std = cluster_std
        self.centers = centers
        self.center_box = center_box
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs
        patterns, labels = make_blobs(
            n_samples=self.n_samples,
            n_features=self.n_features,
            centers=self.centers,
            cluster_std=self.cluster_std,
            center_box=self.center_box,
            random_state=self.random_state)
        return CDataset(patterns, labels)
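

# Illustrative usage sketch (not part of the original module): three 2D blobs
# with fixed centers. Center coordinates are arbitrary examples and the helper
# name `_example_blobs` is hypothetical.
def _example_blobs():
    ds = CDLRandomBlobs(n_samples=90,
                        centers=[(0, 0), (5, 5), (-5, 5)],
                        cluster_std=0.8, random_state=0).load()
    # ds.Y contains one label per blob (0, 1, 2), 30 samples each
    return ds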


class CDLRandomBlobsRegression(CDataLoader):
    """This class loads a blobs regression problem.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points equally divided among clusters.
    cluster_std : list of floats, optional (default=(1.0, 1.0))
        The standard deviation of the clusters.
    bias : float, optional (default=1.0)
        Bias added to the regression function.
    w : list of floats, optional (default=(2.0, -1.0))
        The height (weight) of each gaussian used to build the
        regression function.
    centers : list of tuples, optional (default=([0, 0], [-1, -1]))
        The center of each gaussian cluster.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'blobs-regression'

    """
    __class_type = 'blobs-regression'

    def __init__(self, n_samples=100, cluster_std=(1.0, 1.0),
                 bias=1.0, w=(2.0, -1.0), centers=([0, 0], [-1, -1]),
                 random_state=None):
        self.n_samples = n_samples
        self.bias = bias
        self.w = w
        self.centers = centers
        self.cluster_std = cluster_std
        self.random_state = random_state

    def _dts_function(self, X):
        """Compute the regression targets as the bias plus a weighted
        sum of gaussian pdfs, one centered on each cluster."""
        from secml.ml.stats import CDistributionGaussian
        d = X.shape[1]  # number of features
        Y = self.bias
        for gauss_idx in range(len(self.centers)):
            Y += self.w[gauss_idx] * \
                CDistributionGaussian(
                    mean=self.centers[gauss_idx],
                    cov=self.cluster_std[gauss_idx] * CArray.eye(d, d)).pdf(X)
        return Y

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs
        patterns = make_blobs(
            n_samples=self.n_samples, n_features=2,
            centers=self.centers, cluster_std=self.cluster_std,
            random_state=self.random_state)[0]
        return CDataset(patterns, self._dts_function(CArray(patterns)))
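

# Note (illustrative, not part of the original module): the regression target
# produced by `_dts_function` is
#     y(x) = bias + sum_k w[k] * N(x; mean=centers[k], cov=cluster_std[k] * I),
# i.e. a weighted mixture of gaussian densities evaluated at each pattern.
# The sketch below regenerates the default two-blob dataset; the helper name
# `_example_blobs_regression` is hypothetical.
def _example_blobs_regression():
    ds = CDLRandomBlobsRegression(n_samples=100, random_state=0).load()
    # ds.Y is real-valued: larger near the positively weighted blob at (0, 0),
    # smaller near the negatively weighted blob at (-1, -1)
    return ds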


class CDLRandomCircles(CDataLoader):
    """Make a large circle containing a smaller circle in 2D.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    factor : double < 1, optional (default=0.8)
        Scale factor between inner and outer circle.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'circles'

    """
    __class_type = 'circles'

    def __init__(self, n_samples=100, noise=None,
                 factor=0.8, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.factor = factor
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_circles
        patterns, labels = make_circles(
            n_samples=self.n_samples, noise=self.noise,
            factor=self.factor, random_state=self.random_state)
        return CDataset(patterns, labels)


class CDLRandomCircleRegression(CDataLoader):
    """Make a large circle containing a smaller circle in 2D,
    with a regression target.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    factor : double < 1, optional (default=0.8)
        Scale factor between inner and outer circle.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'circles-regression'

    """
    __class_type = 'circles-regression'

    def __init__(self, n_samples=100, noise=None,
                 factor=0.8, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.factor = factor
        self.random_state = random_state

    def _dts_function(self, X):
        """Compute the regression target as the squared distance
        of each pattern from the origin."""
        return X[:, 0] ** 2 + X[:, 1] ** 2

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_circles
        patterns = make_circles(
            n_samples=self.n_samples, noise=self.noise,
            factor=self.factor, random_state=self.random_state)[0]
        return CDataset(patterns, self._dts_function(patterns))
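

# Note (illustrative, not part of the original module): `_dts_function` maps
# each 2D point to its squared radius, y = x0**2 + x1**2, so the two circles
# produced by make_circles get targets around factor**2 (inner) and 1 (outer),
# plus noise. The helper name `_example_circle_regression` is hypothetical.
def _example_circle_regression():
    ds = CDLRandomCircleRegression(n_samples=100, noise=0.05,
                                   factor=0.5, random_state=0).load()
    return ds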


class CDLRandomMoons(CDataLoader):
    """Make two interleaving half circles.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    noise : double or None, optional (default=None)
        Standard deviation of gaussian noise added to the data.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the RandomState instance used by
        np.random is used.

    Attributes
    ----------
    class_type : 'moons'

    """
    __class_type = 'moons'

    def __init__(self, n_samples=100, noise=None, random_state=None):
        self.n_samples = n_samples
        self.noise = noise
        self.random_state = random_state

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_moons
        patterns, labels = make_moons(
            n_samples=self.n_samples, noise=self.noise,
            random_state=self.random_state)
        return CDataset(patterns, labels)


class CDLRandomBinary(CDataLoader):
    """Generate random binary data.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    n_features : int, optional (default=2)
        The total number of features.

    Attributes
    ----------
    class_type : 'binary'

    """
    __class_type = 'binary'

    def __init__(self, n_samples=100, n_features=2):
        self.n_samples = n_samples
        self.n_features = n_features

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        patterns = CArray.randint(
            2, shape=(self.n_samples, self.n_features))
        labels = CArray.randint(2, shape=(1, self.n_samples))
        return CDataset(patterns, labels)
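

# Illustrative usage sketch (not part of the original module): both patterns
# and labels are drawn uniformly from {0, 1}, so the features carry no signal
# about the labels. The helper name `_example_binary` is hypothetical.
def _example_binary():
    ds = CDLRandomBinary(n_samples=10, n_features=4).load()
    # ds.X is a 10x4 array of 0/1 values; ds.Y holds 10 random 0/1 labels
    return ds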


class CDLRandomToy(CDataLoader, metaclass=ABCMeta):
    """Loads a random toy dataset (abstract interface).

    Available toy datasets:
     - iris (classification) -> `CDLIris`
     - digits (classification) -> `CDLDigits`
     - boston (regression) -> `CDLBoston`
     - diabetes (regression) -> `CDLDiabetes`

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    """
    __lock = Lock()  # Lock to prevent multiple parallel download/extraction

    def __init__(self, class_list=None, zero_one=False):
        self.class_list = class_list
        self.zero_one = zero_one

    @property
    @abstractmethod
    def toy(self):
        """Identifier of the toy dataset."""
        raise NotImplementedError

    def _select_classes(self, class_list, patterns, labels):
        sel_patterns = None
        sel_labels = None

        for single_class in class_list:
            this_class_pat_idx = labels.find(labels == single_class)
            if sel_patterns is None:
                sel_patterns = patterns[this_class_pat_idx, :]
                sel_labels = labels[this_class_pat_idx]
            else:
                sel_patterns = sel_patterns.append(
                    patterns[this_class_pat_idx, :], axis=0)
                sel_labels = sel_labels.append(
                    labels[this_class_pat_idx])

        if self.zero_one is True:
            if len(class_list) > 2:
                raise ValueError("cannot convert labels to 0/1 for a "
                                 "dataset with more than 2 classes")
            else:
                class_list.sort()
                sel_labels[sel_labels == class_list[0]] = 0
                sel_labels[sel_labels == class_list[1]] = 1

        return CDataset(sel_patterns, sel_labels)

    def load(self):
        """Loads the dataset.

        Returns
        -------
        dataset : CDataset
            The loaded dataset.

        """
        with CDLRandomToy.__lock:
            if self.toy == 'iris':
                from sklearn.datasets import load_iris
                toy_data = load_iris()
            elif self.toy == 'digits':
                from sklearn.datasets import load_digits
                toy_data = load_digits()
            elif self.toy == 'boston':
                from sklearn.datasets import load_boston
                toy_data = load_boston()
            elif self.toy == 'diabetes':
                from sklearn.datasets import load_diabetes
                toy_data = load_diabetes()
            else:
                raise ValueError(
                    "toy dataset {:} is not available.".format(self.toy))

        # Returning a CDataset
        if self.class_list is None:
            return CDataset(CArray(toy_data.data), CArray(toy_data.target))
        else:
            return self._select_classes(self.class_list,
                                        CArray(toy_data.data),
                                        CArray(toy_data.target))


class CDLIris(CDLRandomToy):
    """Loads Iris dataset.

    The iris dataset is a classic and very easy
    multi-class classification dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'iris'

    """
    __class_type = 'iris'

    toy = 'iris'
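

# Illustrative usage sketch (not part of the original module): loading Iris
# and keeping only two classes with 0/1 labels. Based on `_select_classes`
# above, the `class_list` entries are compared against the numeric sklearn
# targets (0, 1, 2 for Iris); the helper name `_example_iris_binary` is
# hypothetical.
def _example_iris_binary():
    ds = CDLIris(class_list=[0, 2], zero_one=True).load()
    # 100 samples (50 per class); labels remapped to 0 and 1
    return ds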


class CDLDigits(CDLRandomToy):
    """Loads Digits dataset.

    The digits dataset is a classic and very easy
    multi-class classification dataset. Each datapoint
    is a 8x8 image of a digit.

    =================   ==============
    Classes                         10
    Samples per class             ~180
    Samples total                 1797
    Dimensionality                  64
    Features             integers 0-16
    =================   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'digits'

    """
    __class_type = 'digits'

    toy = 'digits'


class CDLBoston(CDLRandomToy):
    """Loads Boston dataset.

    Boston house-prices dataset, useful for regression.

    ==============   ==============
    Samples total               506
    Dimensionality               13
    Features         real, positive
    Targets           real 5. - 50.
    ==============   ==============

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'boston'

    """
    __class_type = 'boston'

    toy = 'boston'


class CDLDiabetes(CDLRandomToy):
    """Loads Diabetes dataset.

    Diabetes dataset, useful for regression.

    ==============   ==================
    Samples total                   442
    Dimensionality                   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    ==============   ==================

    Parameters
    ----------
    class_list : list or None, optional (default=None)
        Each element is the label of a class to keep in the new dataset.
        If None, every class will be kept.
    zero_one : bool, optional (default=False)
        If True and the class list contains exactly two classes, the
        class with the lower label is relabeled to 0 and the other to 1.

    Attributes
    ----------
    class_type : 'diabetes'

    """
    __class_type = 'diabetes'

    toy = 'diabetes'
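

# Illustrative usage sketch (not part of the original module): the regression
# toys are typically loaded without class selection, since their targets are
# continuous. The helper name `_example_diabetes` is hypothetical.
def _example_diabetes():
    ds = CDLDiabetes().load()
    # ds.X has shape (442, 10); ds.Y holds the continuous targets
    return ds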