# Source code for secml.data.selection.c_ps_kmedians
"""
.. module:: PrototypesSelectorKMedians
   :synopsis: Selector of prototypes using k-medians strategy.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
"""
from secml.data.selection import CPrototypesSelector
from secml.array import CArray
from secml.ml.kernels import CKernelEuclidean
class CPSKMedians(CPrototypesSelector):
    """Prototype selector based on the K-Medians strategy.

    The dataset is first partitioned with a k-means clustering; one
    prototype is then picked from each cluster as its set median, i.e.
    the cluster member with the smallest summed distance to the other
    members of the same cluster.

    References
    ----------
    Spillmann, Barbara, et al. "Transforming strings to vector
    spaces using prototype selection." Structural, Syntactic,
    and Statistical Pattern Recognition.
    Springer Berlin Heidelberg, 2006. 287-296.

    Attributes
    ----------
    class_type : 'k-medians'

    """
    __class_type = 'k-medians'

    def select(self, dataset, n_prototypes, random_state=None):
        """Pick `n_prototypes` samples from the given dataset.

        The indices of the chosen samples are also stored in the
        `_sel_idx` attribute of this object.

        Parameters
        ----------
        dataset : CDataset
            Dataset the prototypes are extracted from.
        n_prototypes : int
            How many prototypes to select. It is used as the number
            of clusters, so one prototype per cluster is returned.
        random_state : int, RandomState or None, optional
            Forwarded to the k-means routine, where it drives the random
            number generation for centroid initialization. Default None.

        Returns
        -------
        reduced_ds : CDataset
            Dataset restricted to the selected prototypes.

        Raises
        ------
        ValueError
            If one of the clusters has no sample assigned to it.

        """
        from sklearn.cluster import k_means

        clustering = k_means(
            dataset.X.tondarray(),
            n_clusters=n_prototypes,
            random_state=random_state)
        # Second element of the k_means output: cluster label per sample
        labels = CArray(clustering[1])

        # Pairwise distances among all samples, computed once up front.
        # NOTE(review): the sign flip presumably turns the kernel output
        # into plain distances - confirm against CKernelEuclidean.
        kernel = CKernelEuclidean()
        distances = -kernel.k(dataset.X)

        # Dataset indices of the prototypes, one per cluster
        picked = []
        for cluster_id in range(n_prototypes):

            # Dataset indices of the samples assigned to this cluster
            members = labels.find(labels == cluster_id)
            n_members = len(members)

            if n_members == 0:  # Nothing to choose from
                raise ValueError(
                    "No sample in the cluster {:}".format(cluster_id))

            if n_members == 1:
                # A lone sample is trivially the median of its cluster
                median_pos = 0
            else:
                # Distances restricted to the members of this cluster
                within = distances[members, members]
                # Set median: the member whose summed distance to the
                # rest of the cluster is the smallest
                totals = within.sum(axis=0, keepdims=False)
                median_pos = totals.argmin()

            # Map the position inside the cluster back to a dataset index
            picked.append(members[median_pos])

        self.logger.debug("Selecting samples: {:}".format(picked))

        self._sel_idx = CArray(picked)

        # Hand back only the rows of the selected prototypes
        return dataset[self._sel_idx, :]