# Source code for secml.data.loader.c_dataloader_svmlight

"""
.. module:: CDataLoaderSvmLight
   :synopsis: Load and save a dataset to/from disk.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

from secml.data.loader import CDataLoader
from secml.array import CArray
from secml.data import CDataset, CDatasetHeader


class CDataLoaderSvmLight(CDataLoader):
    """Loads and Saves data in svmlight / libsvm format.

    Attributes
    ----------
    class_type : 'svmlight'

    """
    __class_type = 'svmlight'

    def __init__(self):
        # Does nothing...
        pass
    
[docs]    def load(self, file_path, dtype_samples=float, dtype_labels=float,
             n_features=None, zero_based=True, remove_all_zero=False,
             multilabel=False, load_infos=False):
        """Loads a dataset from the svmlight / libsvm format and
        returns a sparse dataset.

        Datasets must have only numerical feature indices and
        for every pattern indices must be ordered.

        Extra dataset attributes:
         - 'infos', CArray with inline comment for each sample.

        Parameters
        ----------
        file_path : String
            Path to file were dataset are stored into format svmlight or libsvm.
        dtype_samples : str or dtype, optional
            Data-type to which the samples should be casted. Default is float.
        dtype_labels : str or dtype, optional
            Data-type to which the labels should be casted. Default is float.
        n_features : None or int, optional
            The number of features to use.
            If None (default), it will be inferred. This argument is useful
            to load several files that are subsets of a bigger sliced
            dataset: each subset might not have examples of every feature,
            hence the inferred shape might vary from one slice to another.
        zero_based: bool, optional
            Whether column indices are zero-based (True, default) or
            one-based (False). If column indices are set to be one-based,
            they are transformed to zero-based to match
            Python/NumPy conventions.
        remove_all_zero: boolean, optional, default True
            If True every feature which is zero for every pattern
            will be removed from dataset.
        multilabel : boolean, optional
            True if every sample can have more than one label. Default False.
            (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
        load_infos : bool, optional
            If True, inline comments will be loaded from the svmlight file
            and stored in the infos CDataset parameter (as CArray).
            Default False.

        Returns
        -------
        dataset : CDataset
            Dataset object that contain patterns and labels.
            If `remove_all_zero` is set to True, the returned dataset
            will have the new argument `idx_mapping` with the mapping of
            the returned features to the original features's indices.

        Examples
        --------
        >>> from secml.data.loader import CDataLoaderSvmLight
        >>> from secml.array import CArray
        >>> patterns = CArray ([[1,0,2], [4,0,5]])
        >>> labels = CArray ([0, 1])
        >>> CDataLoaderSvmLight().dump(CDataset(patterns,labels), "myfile.libsvm")
        >>> new_dataset = CDataLoaderSvmLight().load("myfile.libsvm", remove_all_zero=True)
        >>> print(new_dataset.X)  # doctest: +NORMALIZE_WHITESPACE
        CArray(  (0, 1)	2.0
          (0, 0)	1.0
          (1, 1)	5.0
          (1, 0)	4.0)
        >>> print(new_dataset.Y)
        CArray([0. 1.])
        >>> print(new_dataset.header.idx_mapping)
        CArray([0 2])

        """
        # Never use zero_based='auto' in order to avoid
        # any ambiguity with the features indices...
        patterns, labels = load_svmlight_file(file_path,
                                              n_features=n_features,
                                              dtype=float,
                                              multilabel=multilabel,
                                              zero_based=zero_based)

        patterns = CArray(patterns, tosparse=True, dtype=dtype_samples)
        labels = CArray(labels, dtype=dtype_labels)

        header = CDatasetHeader()  # Will be populated with extra attributes

        if remove_all_zero is True:
            patterns, idx_mapping = \
                CDataLoaderSvmLight._remove_all_zero_features(patterns)
            # Store reverse mapping as extra ds attribute
            header.idx_mapping = idx_mapping

        if load_infos is True:
            infos = []
            with open(file_path, 'rt') as f:
                for l_idx, l in enumerate(f):
                    i = l.split(' # ')
                    if len(i) > 2:  # Line should have only one split point
                        raise ValueError("Something wrong happened when "
                                         "extracting infos for line {:}"
                                         "".format(l_idx))
                    infos.append(i[1].rstrip() if len(i) == 2 else '')
            header.infos = CArray(infos)

        if len(header.get_params()) == 0:
            header = None  # Header is empty, store None in ds

        return CDataset(patterns, labels, header=header)

[docs]    @staticmethod
    def dump(d, f, zero_based=True, comment=None):
        """Dumps a dataset in the svmlight / libsvm file format.

        This format is a text-based format, with one sample per line. 
        It does not store zero valued features hence is suitable for sparse dataset.
        
        The first element of each line can be used to store a target variable to predict.

        Parameters
        ----------
        d : CDataset 
            Contain dataset with patterns and labels that we want store. 
        f : String 
            Path to file were we want store dataset into format svmlight or libsvm.
        zero_based : bool, optional
            Whether column indices should be written zero-based (True, default) or one-based (False).
        comment : string, optional
            Comment to insert at the top of the file.
            This should be either a Unicode string, which will be encoded as UTF-8,
            or an ASCII byte string. If a comment is given, then it will be preceded
            by one that identifies the file as having been dumped by scikit-learn.
            Note that not all tools grok comments in SVMlight files.

        Examples
        --------
        >>> from secml.data.loader import CDataLoaderSvmLight
        >>> from secml.array import CArray
        >>> patterns = CArray([[1,0,2], [4,0,5]])
        >>> labels = CArray([0,1])
        >>> CDataLoaderSvmLight.dump(CDataset(patterns,labels), "myfile.libsvm")

        """
        dump_svmlight_file(d.X.get_data(), d.Y.get_data(), f,
                           zero_based=zero_based, comment=comment)