# Source code for secml.data.selection.c_ps_kmedians

"""
.. module:: PrototypesSelectorKMedians
   :synopsis: Selector of prototypes using k-medians strategy.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>

"""
from secml.data.selection import CPrototypesSelector
from secml.array import CArray
from secml.ml.kernels import CKernelEuclidean


class CPSKMedians(CPrototypesSelector):
    """Selection of Prototypes using K-Medians strategy.

    Runs a k-means clustering to obtain a set of clusters from
    the dataset. Then selects the prototypes as their set medians,
    i.e. for each cluster the sample having the smallest sum of
    distances to the samples assigned to the same cluster.

    References
    ----------
    Spillmann, Barbara, et al. "Transforming strings to vector
    spaces using prototype selection." Structural, Syntactic,
    and Statistical Pattern Recognition.
    Springer Berlin Heidelberg, 2006. 287-296.

    Attributes
    ----------
    class_type : 'k-medians'

    """
    __class_type = 'k-medians'

    def select(self, dataset, n_prototypes, random_state=None):
        """Selects the prototypes from input dataset.

        The indices of the selected samples are also stored in
        the `_sel_idx` attribute of this object.

        Parameters
        ----------
        dataset : CDataset
            Dataset from which prototypes should be selected
        n_prototypes : int
            Number of prototypes to be selected. It is used as the
            number of clusters for k-means, so one prototype is
            selected from each cluster.
        random_state : int, RandomState or None, optional
            Determines random number generation for centroid initialization.
            Default None.

        Returns
        -------
        reduced_ds : CDataset
            Dataset with selected prototypes.

        Raises
        ------
        ValueError
            If no sample has been assigned to one of the clusters.

        """
        # Imported locally so sklearn is only required when selecting
        from sklearn.cluster import k_means

        km = k_means(dataset.X.tondarray(), n_clusters=n_prototypes,
                     random_state=random_state)
        # Second element of the k_means output: cluster label of each sample
        km_labels = CArray(km[1])

        # Precomputing pairwise distances between all samples.
        # NOTE(review): assumes CKernelEuclidean returns the negative
        # Euclidean distance, so the sign flip yields distances - confirm
        k_euclidean = - CKernelEuclidean().k(dataset.X)

        # List of selected prototypes (indices)
        sel_idx = []
        for i in range(n_prototypes):
            # Find the samples associated with each cluster
            cluster_indices = km_labels.find(km_labels == i)
            n_members = len(cluster_indices)

            if n_members == 0:  # No sample in the cluster?!
                raise ValueError("No sample in the cluster {:}".format(i))

            if n_members == 1:  # One sample in the cluster
                median_pos = 0
            else:
                # Distances restricted to the samples of this cluster
                # (presumably the square intra-cluster sub-matrix)
                cluster_dist = k_euclidean[cluster_indices, cluster_indices]
                # Median prototype: smallest sum of distances
                median_pos = cluster_dist.sum(axis=0, keepdims=False).argmin()

            # Map the position inside the cluster back to a dataset index
            sel_idx.append(cluster_indices[median_pos])

        self.logger.debug("Selecting samples: {:}".format(sel_idx))

        self._sel_idx = CArray(sel_idx)

        # Returning the reduced training set
        return dataset[self._sel_idx, :]