Source code for secml.ml.peval.c_perfevaluator

"""
.. module:: PerformanceEvaluation
   :synopsis: Common interface and methods for performance estimation

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Ambra Demontis <ambra.demontis@unica.it>

"""
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from copy import deepcopy

from secml.core import CCreator
from secml.array import CArray
from secml.data.splitter import CDataSplitter
from secml.ml.peval.metrics import CMetric
from secml.parallel import parfor2


def _evaluate_one(
        row_id, perf_eval, params, params_matrix, estimator, ds, verbose):
    """Evaluate performance of estimator for one combination of parameters.

    Parameters
    ----------
    row_id : int
        Index of the row of params_matrix from which parameters
        to test should be extracted.
    perf_eval : CPerfEvaluator
        Evaluator object that will be used for performance evaluation.
    params : dict
        Dictionary with the parameters to be evaluated.
    params_matrix : CArray
        Indices of each combination of parameters to evaluate.
    estimator : CClassifier
        The classifier for which we want to choose the best parameters.
    ds : CDataset
        Dataset to be used for evaluating parameters.
    verbose : int
        Sets verbosity level of the performance evaluator object.

    """
    # Build a dictionary with parameters to evaluate
    estimator_params = {}
    for par_idx, par in enumerate(params):
        # This works as params is an OrderedDict
        value_id = params_matrix[row_id, par_idx].item()
        estimator_params[par] = params[par][value_id]

    # Set estimator parameters using current combination
    estimator.set_params(estimator_params)

    # Reset the verbosity level, as the parallel execution copies the object
    perf_eval.verbose = verbose

    # Compute performance for current params set
    eval_score = perf_eval.compute_performance(estimator, ds)

    perf_eval.logger.info(
        "Params: {:} - Score: {:}".format(estimator_params, eval_score))

    return eval_score
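
# A minimal sketch of how a single parameters combination is built above.
# The parameter names and values below are hypothetical and only serve to
# illustrate the mapping between `params_matrix` rows and estimator params:
#
#   >>> params = OrderedDict([('C', [0.1, 1, 10]), ('gamma', [0.01, 0.1])])
#   >>> params_idx = [list(range(len(v))) for v in params.values()]
#   >>> params_matrix = CArray.comblist(params_idx).astype(int)
#   >>> # e.g. the row [2, 1] selects {'C': 10, 'gamma': 0.1}, which is then
#   >>> # applied to the classifier via `estimator.set_params(...)`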


class CPerfEvaluator(CCreator, metaclass=ABCMeta):
    """Evaluate the best parameters for input estimator.

    Parameters
    ----------
    splitter : CDataSplitter or str
        Object to use for splitting the dataset into train and validation.
    metric : CMetric or str
        Name of the metric that we want to maximize / minimize.

    """
    __super__ = 'CPerfEvaluator'

    def __init__(self, splitter, metric):

        self.splitter = CDataSplitter.create(splitter)
        self.metric = CMetric.create(metric)

    def evaluate_params(
            self, estimator, dataset, parameters, pick='first', n_jobs=1):
        """Evaluate parameters for input estimator on input dataset.

        Parameters
        ----------
        estimator : CClassifier
            The classifier for which we want to choose the best parameters.
        dataset : CDataset
            Dataset to be used for evaluating parameters.
        parameters : dict
            Dictionary with each entry as {parameter: list of values to test}.
        pick : {'first', 'last', 'random'}, optional
            Defines which of the best parameter sets to pick.
            Usually, 'first' (default) corresponds to the smallest
            parameters while 'last' corresponds to the biggest.
            The order is consistent with the parameters dict passed as input.
        n_jobs : int, optional
            Number of parallel workers to use. Default 1.
            Cannot be higher than the number of processor cores.

        Returns
        -------
        best_params_dict : dict
            A dictionary with the best value for each evaluated parameter.
        best_value : any
            Metric value obtained on the validation set by the estimator.

        """
        self.logger.info("Parameters to evaluate: {:}".format(parameters))

        # FIRST OF ALL: save current classifier to restore later
        original_estimator = deepcopy(estimator)

        # Compute dataset splits
        self.splitter.compute_indices(dataset)

        # OrderedDict returns keys always in the same order,
        # so we are safe when iterating on params_matrix.shape[1]
        parameters = OrderedDict(
            sorted(parameters.items(), key=lambda t: t[0]))

        params_idx = []
        # Create a list of lists 'params_idx' with the indices of each
        # parameter's values
        for param_name in parameters:
            if not isinstance(parameters[param_name], list):
                raise TypeError("values for parameter `{:}` must be "
                                "specified as a list.".format(param_name))
            # Add an index for each parameter's value
            params_idx.append(list(range(len(parameters[param_name]))))

        # This is a matrix of indices, e.g. [[1, 1], [1, 2], ...]
        # Each row corresponds to the indices of the parameters to be set
        params_matrix = CArray.comblist(params_idx).astype(int)

        # Parallelize (if requested) over the rows of params_matrix
        res_vect = parfor2(_evaluate_one, params_matrix.shape[0], n_jobs,
                           self, parameters, params_matrix,
                           estimator, dataset, self.verbose)
        # Transform the list into an array
        res_vect = CArray(res_vect)

        # Retrieve the best parameters
        best_params_dict, best_value = self._get_best_params(
            res_vect, parameters, params_matrix, pick=pick)

        self.logger.info("Best params: {:} - Value: {:}".format(
            best_params_dict, best_value))

        # Restore original parameters of the classifier
        for param in original_estimator.__dict__:
            estimator.__dict__[param] = original_estimator.__dict__[param]

        return best_params_dict, best_value
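
    # A minimal usage sketch (not part of this module): with a concrete
    # evaluator, e.g. the cross-validation one that secml registers as
    # 'xval', parameters can be evaluated as follows. The classifier `clf`,
    # the dataset `tr_ds` and the parameter grid are placeholders:
    #
    #   >>> peval = CPerfEvaluator.create(
    #   ...     'xval', splitter='kfold', metric='accuracy')
    #   >>> best_params, best_score = peval.evaluate_params(
    #   ...     clf, tr_ds, parameters={'C': [0.1, 1, 10]}, pick='first')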

    @abstractmethod
    def compute_performance(self, estimator, dataset):
        """Compute estimator performance on input dataset.

        This must be reimplemented by subclasses.

        Parameters
        ----------
        estimator : CClassifier
            The classifier that we want to evaluate.
        dataset : CDataset
            Dataset that we want to use to evaluate the classifier.

        Returns
        -------
        score : float
            Performance score of the estimator.

        """
        raise NotImplementedError()

    @abstractmethod
    def _get_best_params(self, res_vect, params, params_matrix, pick='first'):
        """Returns the best parameters given input performance data.

        Parameters
        ----------
        res_vect : CArray
            Array with the performance results associated with each
            parameters combination.
        params : dict
            Dictionary with the parameters to be evaluated.
        params_matrix : CArray
            Indices of each combination of parameters to evaluate.
        pick : {'first', 'last', 'random'}, optional
            Defines which of the best parameter sets to pick.
            Usually, 'first' (default) corresponds to the smallest
            parameters while 'last' corresponds to the biggest.
            The order is consistent with the parameters dict passed as input.

        Returns
        -------
        best_params_dict : dict
            Dictionary with the parameters that have obtained the best
            performance score.
        best_value : any
            Performance value associated with the best parameters.

        """
        raise NotImplementedError()
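

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of secml): a minimal subclass showing
# one way the two abstract methods above could be filled in. The metric is
# assumed to be maximized, `estimator.fit(x, y)` is assumed as the training
# signature and only pick='first' is handled; the concrete evaluators
# shipped with secml implement this logic in full.
# ---------------------------------------------------------------------------
class _CPerfEvaluatorSketch(CPerfEvaluator):
    """Hypothetical evaluator used only to illustrate the abstract interface."""

    def compute_performance(self, estimator, dataset):
        # Average the metric over the train/validation splits computed by
        # `self.splitter.compute_indices(dataset)` in `evaluate_params`
        scores = CArray.zeros(len(self.splitter.tr_idx))
        for i, (tr_idx, ts_idx) in enumerate(
                zip(self.splitter.tr_idx, self.splitter.ts_idx)):
            xtr, ytr = dataset.X[tr_idx, :], dataset.Y[tr_idx]
            xts, yts = dataset.X[ts_idx, :], dataset.Y[ts_idx]
            estimator.fit(xtr, ytr)  # assumed fit(x, y) signature
            y_pred = estimator.predict(xts)
            scores[i] = self.metric.performance_score(
                y_true=yts, y_pred=y_pred)
        return scores.mean()

    def _get_best_params(self, res_vect, params, params_matrix, pick='first'):
        # Pick the first combination reaching the highest score
        best_row = res_vect.argmax()
        best_params_dict = dict(
            (name, params[name][params_matrix[best_row, i].item()])
            for i, name in enumerate(params))
        return best_params_dict, res_vect[best_row].item()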