Source code for secml.data.loader.c_dataloader_svmlight

"""
.. module:: CDataLoaderSvmLight
   :synopsis: Load and save a dataset to/from disk.

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

from secml.data.loader import CDataLoader
from secml.array import CArray
from secml.data import CDataset, CDatasetHeader


class CDataLoaderSvmLight(CDataLoader):
    """Loads and Saves data in svmlight / libsvm format.

    Attributes
    ----------
    class_type : 'svmlight'

    """
    __class_type = 'svmlight'

    def __init__(self):
        # Does nothing...
        pass

    def load(self, file_path, dtype_samples=float, dtype_labels=float,
             n_features=None, zero_based=True, remove_all_zero=False,
             multilabel=False, load_infos=False):
        """Loads a dataset from the svmlight / libsvm format and
        returns a sparse dataset.

        Datasets must have only numerical feature indices and
        for every pattern indices must be ordered.

        Extra dataset attributes:
         - 'infos', CArray with inline comment for each sample.

        Parameters
        ----------
        file_path : String
            Path to the file where the dataset is stored in
            svmlight or libsvm format.
        dtype_samples : str or dtype, optional
            Data-type to which the samples should be cast.
            Default is float.
        dtype_labels : str or dtype, optional
            Data-type to which the labels should be cast.
            Default is float.
        n_features : None or int, optional
            The number of features to use.
            If None (default), it will be inferred. This argument is
            useful to load several files that are subsets of a bigger
            sliced dataset: each subset might not have examples of every
            feature, hence the inferred shape might vary from one
            slice to another.
        zero_based : bool, optional
            Whether column indices are zero-based (True, default) or
            one-based (False). If column indices are set to be one-based,
            they are transformed to zero-based to match
            Python/NumPy conventions.
        remove_all_zero : boolean, optional
            If True every feature which is zero for every pattern
            will be removed from dataset. Default False.
        multilabel : boolean, optional
            True if every sample can have more than one label.
            Default False.
            (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
        load_infos : bool, optional
            If True, inline comments will be loaded from the svmlight file
            and stored in the infos CDataset parameter (as CArray).
            Lines holding no sample (blank or comment-only lines, e.g.
            the header written by `dump` when a `comment` is given)
            are ignored, so that one entry per loaded sample is stored.
            Default False.

        Returns
        -------
        dataset : CDataset
            Dataset object that contain patterns and labels.
            If `remove_all_zero` is set to True, the returned dataset
            will have the new argument `idx_mapping` with the mapping of
            the returned features to the original features' indices.

        Raises
        ------
        ValueError
            If `load_infos` is True and a sample line contains more
            than one ' # ' separator.

        Examples
        --------
        >>> from secml.data.loader import CDataLoaderSvmLight
        >>> from secml.data import CDataset
        >>> from secml.array import CArray

        >>> patterns = CArray([[1,0,2], [4,0,5]])
        >>> labels = CArray([0, 1])
        >>> CDataLoaderSvmLight().dump(CDataset(patterns,labels), "myfile.libsvm")
        >>> new_dataset = CDataLoaderSvmLight().load("myfile.libsvm", remove_all_zero=True)
        >>> print(new_dataset.X)  # doctest: +NORMALIZE_WHITESPACE
        CArray(  (0, 1) 2.0
          (0, 0) 1.0
          (1, 1) 5.0
          (1, 0) 4.0)
        >>> print(new_dataset.Y)
        CArray([0. 1.])
        >>> print(new_dataset.header.idx_mapping)
        CArray([0 2])

        """
        # Never use zero_based='auto' in order to avoid
        # any ambiguity with the features indices...
        patterns, labels = load_svmlight_file(
            file_path, n_features=n_features, dtype=float,
            multilabel=multilabel, zero_based=zero_based)

        patterns = CArray(patterns, tosparse=True, dtype=dtype_samples)
        labels = CArray(labels, dtype=dtype_labels)

        header = CDatasetHeader()  # Will be populated with extra attributes

        if remove_all_zero is True:
            patterns, idx_mapping = \
                CDataLoaderSvmLight._remove_all_zero_features(patterns)
            # Store reverse mapping as extra ds attribute
            header.idx_mapping = idx_mapping

        if load_infos is True:
            infos = []
            with open(file_path, 'rt') as f:
                for line_idx, line in enumerate(f):
                    # The scikit-learn parser drops everything after the
                    # first '#' and skips lines left without content.
                    # Do the same here, otherwise blank or comment-only
                    # lines would add spurious entries and shift the
                    # infos with respect to the loaded samples.
                    if not line.partition('#')[0].strip():
                        continue
                    parts = line.split(' # ')
                    if len(parts) > 2:  # Line should have only one split point
                        raise ValueError("Something wrong happened when "
                                         "extracting infos for line {:}"
                                         "".format(line_idx))
                    # Samples without an inline comment get an empty info
                    infos.append(parts[1].rstrip() if len(parts) == 2 else '')
            header.infos = CArray(infos)

        if len(header.get_params()) == 0:
            header = None  # Header is empty, store None in ds

        return CDataset(patterns, labels, header=header)

    @staticmethod
    def dump(d, f, zero_based=True, comment=None):
        """Dumps a dataset in the svmlight / libsvm file format.

        This format is a text-based format, with one sample per line.
        It does not store zero valued features hence is suitable
        for sparse dataset.

        The first element of each line can be used to store
        a target variable to predict.

        Parameters
        ----------
        d : CDataset
            Dataset with the patterns and labels that we want to store.
        f : String
            Path to the file where we want to store the dataset
            in svmlight or libsvm format.
        zero_based : bool, optional
            Whether column indices should be written zero-based
            (True, default) or one-based (False).
        comment : string, optional
            Comment to insert at the top of the file.
            This should be either a Unicode string, which will be
            encoded as UTF-8, or an ASCII byte string. If a comment
            is given, then it will be preceded by one that identifies
            the file as having been dumped by scikit-learn.
            Note that not all tools grok comments in SVMlight files.

        Examples
        --------
        >>> from secml.data.loader import CDataLoaderSvmLight
        >>> from secml.data import CDataset
        >>> from secml.array import CArray

        >>> patterns = CArray([[1,0,2], [4,0,5]])
        >>> labels = CArray([0,1])
        >>> CDataLoaderSvmLight.dump(CDataset(patterns,labels), "myfile.libsvm")

        """
        # Samples and labels are handed to scikit-learn through the
        # objects returned by CArray.get_data()
        dump_svmlight_file(d.X.get_data(), d.Y.get_data(), f,
                           zero_based=zero_based, comment=comment)