Source code for secml.adv.attacks.evasion.foolbox.fb_attacks.fb_cw_attack

"""
.. module:: CFoolboxL2CarliniWagner
    :synopsis: Performs the Foolbox Carlini & Wagner L2 attack.

.. moduleauthor:: Luca Demetrio <luca.demetrio@dibris.unige.it>
.. moduleauthor:: Maura Pintor <maura.pintor@unica.it>

"""

from functools import partial

import eagerpy as ep
import foolbox as fb
import numpy as np
from foolbox import Misclassification, TargetedMisclassification
from foolbox.attacks.base import raise_if_kwargs, get_criterion
from foolbox.attacks.carlini_wagner import (
    _to_attack_space, _to_model_space, best_other_classes, AdamOptimizer)
from foolbox.devutils import flatten, atleast_kd

from secml.adv.attacks.evasion.foolbox.c_attack_evasion_foolbox import CAttackEvasionFoolbox
from secml.adv.attacks.evasion.foolbox.losses.cw_loss import CWLoss
from secml.adv.attacks.evasion.foolbox.secml_autograd import as_tensor
from secml.array import CArray


class CFoolboxL2CarliniWagner(CWLoss, CAttackEvasionFoolbox):
    """
    Carlini & Wagner L2 Attack [#Carl16]_.

    Credits: https://github.com/bethgelab/foolbox/blob/master/foolbox/attacks/carlini_wagner.py

    Parameters
    ----------
    classifier : CClassifier
        Trained secml classifier.
    y_target : int or None, optional
        If None, an indiscriminate attack will be performed, else a
        targeted attack to have the samples misclassified as belonging
        to the y_target class.
    lb : float or None, optional
        Lower bound of the model's input space.
    ub : float or None, optional
        Upper bound of the model's input space.
    binary_search_steps : int, optional
        Number of steps to perform in the binary search over the constant c.
    steps : int, optional
        Number of update steps to perform within each binary search step.
    stepsize : float, optional
        Stepsize used to update the examples.
    confidence : float, optional
        Confidence required to mark an example as adversarial. Controls
        the gap between the decision boundary and the adversarial example.
    initial_const : float, optional
        Initial value of the constant c when the binary search starts.
    abort_early : bool, optional
        Stop the inner search as soon as an adversarial example has been
        found. Does not affect the binary search over c.

    References
    ----------
    .. [#Carl16] Nicholas Carlini, David Wagner, "Towards Evaluating the
        Robustness of Neural Networks," 2017 IEEE Symposium on Security
        and Privacy (S&P). https://arxiv.org/abs/1608.04644
    """
    __class_type = 'e-foolbox-cw'

    def __init__(self, classifier, y_target=None, lb=0.0, ub=1.0,
                 binary_search_steps=9, steps=10000, stepsize=1e-2,
                 confidence=0.0, initial_const=1e-3, abort_early=True):
        super(CFoolboxL2CarliniWagner, self).__init__(
            classifier, y_target,
            lb=lb, ub=ub,
            fb_attack_class=_L2CarliniWagnerAttack,
            epsilons=None,
            binary_search_steps=binary_search_steps,
            steps=steps,
            stepsize=stepsize,
            confidence=confidence,
            initial_const=initial_const,
            abort_early=abort_early)
        self.confidence = confidence
        self.c = initial_const
        self._x0 = None
        self._y0 = None
        self.distance = 'l2'
        self._step_per_iter = None
        self.best_c_ = self.c

    def _run(self, x, y, x_init=None):
        self._x0 = as_tensor(x)
        self._y0 = as_tensor(y)
        out, _ = super(CFoolboxL2CarliniWagner, self)._run(x, y, x_init)
        self._consts = self.attack.consts
        self._f_seq = self.objective_function(self.x_seq)
        self.best_c_ = self._consts[self.attack._best_const]
        f_opt = self.objective_function(out)
        return out, f_opt

    @property
    def all_x_seq(self) -> list:
        divided_paths = self._slice_path()
        return divided_paths

    def _slice_path(self):
        # split the flat optimization path into one sub-path per
        # binary search step, using the recorded per-step counts
        all_paths = super(CFoolboxL2CarliniWagner, self).x_seq
        divided_paths = []
        for i, s in enumerate(self.attack._steps_per_iter):
            cumulative_sum = sum(self.attack._steps_per_iter[:i])
            divided_paths.append(
                all_paths[cumulative_sum: cumulative_sum + s, :])
        return divided_paths

    @property
    def x_seq(self):
        # return only the path of the binary search step that
        # produced the best constant c
        last_path = self._slice_path()[self.attack._best_const]
        return last_path
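
# A minimal usage sketch of the class above (illustrative only): `clf` is
# assumed to be a trained secml CClassifier with inputs in [0, 1], and `x0`,
# `y0` a single CArray sample with its true label; the `run` interface is
# the standard one inherited from CAttackEvasion.
#
#   attack = CFoolboxL2CarliniWagner(clf, y_target=None, lb=0.0, ub=1.0,
#                                    binary_search_steps=9, steps=1000,
#                                    stepsize=1e-2, initial_const=1e-3)
#   y_pred, _, adv_ds, f_obj = attack.run(x0, y0)
#   print(attack.best_c_)  # constant c selected by the binary search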

class _L2CarliniWagnerAttack(fb.attacks.L2CarliniWagnerAttack):
    def run(self, model, inputs, criterion, *, early_stop=None, **kwargs):
        raise_if_kwargs(kwargs)
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion, kwargs

        N = len(x)

        if isinstance(criterion_, Misclassification):
            targeted = False
            classes = criterion_.labels
            change_classes_logits = self.confidence
        elif isinstance(criterion_, TargetedMisclassification):
            targeted = True
            classes = criterion_.target_classes
            change_classes_logits = -self.confidence
        else:
            raise ValueError("unsupported criterion")

        def is_adversarial(perturbed: ep.Tensor,
                           logits: ep.Tensor) -> ep.Tensor:
            if change_classes_logits != 0:
                logits += ep.onehot_like(
                    logits, classes, value=change_classes_logits)
            return criterion_(perturbed, logits)

        if classes.shape != (N,):
            name = "target_classes" if targeted else "labels"
            raise ValueError(
                f"expected {name} to have shape ({N},), got {classes.shape}")

        bounds = model.bounds
        to_attack_space = partial(_to_attack_space, bounds=bounds)
        to_model_space = partial(_to_model_space, bounds=bounds)

        x_attack = to_attack_space(x)
        reconstructed_x = to_model_space(x_attack)

        rows = range(N)

        def loss_fun(delta, consts):
            assert delta.shape == x_attack.shape
            assert consts.shape == (N,)

            x = to_model_space(x_attack + delta)
            logits = model(x)

            if targeted:
                c_minimize = best_other_classes(logits, classes)
                c_maximize = classes  # target_classes
            else:
                c_minimize = classes  # labels
                c_maximize = best_other_classes(logits, classes)

            is_adv_loss = logits[rows, c_minimize] - logits[rows, c_maximize]
            assert is_adv_loss.shape == (N,)

            is_adv_loss = is_adv_loss + self.confidence
            is_adv_loss = ep.maximum(0, is_adv_loss)
            is_adv_loss = is_adv_loss * consts

            squared_norms = flatten(x - reconstructed_x).square().sum(axis=-1)
            loss = is_adv_loss.sum() + squared_norms.sum()
            return loss, (x, logits)

        loss_aux_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=True)

        consts = self.initial_const * np.ones((N,))
        lower_bounds = np.zeros((N,))
        upper_bounds = np.inf * np.ones((N,))

        best_advs = ep.zeros_like(x)
        best_advs_norms = ep.full(x, (N,), ep.inf)

        self._consts = []
        self._steps_per_iter = []
        self._best_const = -1

        # the binary search looks for the smallest consts
        # that produce adversarials
        for binary_search_step in range(self.binary_search_steps):
            if (binary_search_step == self.binary_search_steps - 1
                    and self.binary_search_steps >= 10):
                # in the last binary search step, repeat the search once
                # with the upper bounds found so far
                consts = np.minimum(upper_bounds, 1e10)

            iter_step = 0
            # create a new optimizer to find the delta that minimizes the loss
            delta = ep.zeros_like(x_attack)
            optimizer = AdamOptimizer(delta)

            # tracks whether advs with the current consts were found
            found_advs = np.full((N,), fill_value=False)
            loss_at_previous_check = np.inf

            consts_ = ep.from_numpy(x, consts.astype(np.float32))

            for step in range(self.steps):
                loss, (perturbed, logits), gradient = \
                    loss_aux_and_grad(delta, consts_)
                delta += optimizer(gradient, self.stepsize)

                if self.abort_early and \
                        step % (np.ceil(self.steps / 10)) == 0:
                    # after each tenth of the overall steps, check progress
                    if not (loss <= 0.9999 * loss_at_previous_check):
                        break  # stop Adam if there has been no progress
                    loss_at_previous_check = loss

                iter_step += 1
                found_advs_iter = is_adversarial(perturbed, logits)
                found_advs = np.logical_or(
                    found_advs, found_advs_iter.numpy())

                norms = flatten(perturbed - x).norms.l2(axis=-1)
                closer = norms < best_advs_norms
                new_best = ep.logical_and(closer, found_advs_iter)
                # with a single sample (N=1), closer and found_advs_iter
                # are one-element tensors and can be used as booleans
                if closer and found_advs_iter:
                    self._best_const = binary_search_step

                new_best_ = atleast_kd(new_best, best_advs.ndim)
                best_advs = ep.where(new_best_, perturbed, best_advs)
                best_advs_norms = ep.where(new_best, norms, best_advs_norms)

            self._consts.append(consts_.numpy().tolist())
            self._steps_per_iter.append(iter_step)

            upper_bounds = np.where(found_advs, consts, upper_bounds)
            lower_bounds = np.where(found_advs, lower_bounds, consts)

            consts_exponential_search = consts * 10
            consts_binary_search = (lower_bounds + upper_bounds) / 2
            consts = np.where(np.isinf(upper_bounds),
                              consts_exponential_search, consts_binary_search)

        return restore_type(best_advs)

    @property
    def consts(self):
        return CArray(self._consts).ravel()
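
The `_to_attack_space` / `_to_model_space` pair imported above implements the change of variables from the Carlini & Wagner paper: the attack optimizes an unconstrained variable w and maps it back into the input bounds with tanh, so no clipping is needed during the Adam updates. The following standalone numpy sketch illustrates the idea; the helper names and the 0.999999 scaling factor are illustrative, not foolbox's exact code.

import numpy as np

def to_attack_space(x, bounds=(0.0, 1.0)):
    # map [lb, ub] -> (-inf, +inf) so the optimizer is unconstrained
    lb, ub = bounds
    a, b = (lb + ub) / 2, (ub - lb) / 2
    z = (x - a) / b       # [lb, ub] -> [-1, +1]
    z = z * 0.999999      # shrink so arctanh stays finite at the bounds
    return np.arctanh(z)

def to_model_space(w, bounds=(0.0, 1.0)):
    # inverse map: any real-valued w lands back inside [lb, ub]
    lb, ub = bounds
    a, b = (lb + ub) / 2, (ub - lb) / 2
    return np.tanh(w) / 0.999999 * b + a

x = np.array([0.0, 0.25, 1.0])
w = to_attack_space(x)
print(np.allclose(to_model_space(w), x))  # True: the round trip recovers x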