# Source code for secml.data.loader.c_dataloader_icubworld

"""
.. module:: CDataLoaderICubWorld28
   :synopsis: Loader of the ICubWorld dataset

.. moduleauthor:: Marco Melis <marco.melis@unica.it>
.. moduleauthor:: Angelo Sotgiu

"""
from multiprocessing import Lock
import zipfile
import os
from fnmatch import fnmatch

from abc import ABCMeta, abstractmethod

from PIL import Image

from secml import settings
from secml.array import CArray
from secml.data import CDataset, CDatasetHeader
from secml.data.loader import CDataLoader
from secml.data.loader.loader_utils import resize_img, crop_img
from secml.utils import fm
from secml.utils.download_utils import dl_file, md5

# Root folder under which every iCubWorld dataset is stored
ICUBWORLD_PATH = fm.join(settings.SECML_DS_DIR, 'iCubWorld')

# iCubWorld28: download URL, expected MD5 digest of the downloaded
# archive and local folder where the dataset is stored
ICUBWORLD28_URL = (
    'https://data.mendeley.com/datasets/3n2vh9rdxd/1/files/'
    '9e3a79ef-18d9-4c37-b76c-0c34ead60544/iCubWorld28_128x128.zip?dl=1'
)
ICUBWORLD28_MD5 = 'd4fcdd02bdb0054688a213611a7a8ae7'
ICUBWORLD28_PATH = fm.join(ICUBWORLD_PATH, 'iCubWorld28')


# TODO: iCubWorld 1.0
# TODO: Hello iCubWorld
# TODO: iCubWorld Transformations


class CDataLoaderICubWorld(CDataLoader, metaclass=ABCMeta):
    """Interface for loaders of iCubWorld datasets.

    iCubWorld is a set of computer vision datasets for robotic applications,
    developed by Istituto Italiano di Tecnologia (IIT), Genova, Italy.

    REF: https://robotology.github.io/iCubWorld

    """

    @abstractmethod
    def load(self, *args, **kwargs):
        """Loads a dataset.

        Abstract method: concrete loaders must override it and
        return a `.CDataset` object.

        Raises
        ------
        NotImplementedError
            Always, if the base implementation is called.

        """
        raise NotImplementedError


class CDataLoaderICubWorld28(CDataLoaderICubWorld):
    """Loader for iCubWorld28 dataset.

    The dataset consists of 28 objects divided in 7 categories,
    where each category includes 4 objects. For each object there are 4
    different acquisition days for training and 4 for testing, with ~150
    frames per acquisition.

    On instantiation the dataset is downloaded and extracted under
    `ICUBWORLD28_PATH` if the 'train' and 'test' folders are not
    already available there.

    Attributes
    ----------
    class_type : 'icubworld28'

    """
    __class_type = 'icubworld28'
    __lock = Lock()  # Lock to prevent multiple parallel download/extraction

    def __init__(self):

        self._train_path = fm.join(ICUBWORLD28_PATH, 'train')
        self._test_path = fm.join(ICUBWORLD28_PATH, 'test')

        with CDataLoaderICubWorld28.__lock:
            # Download (if needed) data and extract it
            if not self._data_available():
                self._get_data(ICUBWORLD28_URL, ICUBWORLD28_PATH)

    def _data_available(self):
        """Return True if both the 'train' and 'test' folders exist."""
        return fm.folder_exist(self._train_path) \
            and fm.folder_exist(self._test_path)

    def load(self, ds_type, day='day4', icub7=False,
             resize_shape=(128, 128), crop_shape=None, normalize=True):
        """Load the dataset.

        The pre-cropped version of the images is loaded, with size 128 x 128.
        An additional resize/crop shape could be passed as input if needed.

        Extra dataset attributes:
          - 'img_w', 'img_h': size of the images in pixels.
          - 'y_orig': CArray with the original labels of the objects.

        Parameters
        ----------
        ds_type : str
            Identifier of the dataset to load, either 'train' or 'test'.
        day : str, optional
            Acquisition day from which to load the images. Default 'day4'.
            The available options are: 'day1', 'day2', 'day3', 'day4'.
        icub7 : bool or int, optional
            If True, load a reduced dataset with 7 objects by
            taking the 3rd object for each category. Default False.
            If int, the Nth object for each category will be loaded.
        resize_shape : tuple or None, optional
            Images will be resized to (height, width) shape.
            Default (128, 128). If None, images are not resized.
        crop_shape : tuple or None, optional
            If a tuple, a crop of (height, width) shape will be extracted
            from the center of each image. Default None.
        normalize : bool, optional
            If True, images are normalized between 0-1. Default True.

        Returns
        -------
        CDataset
            Output dataset.

        Raises
        ------
        ValueError
            If `ds_type` is not 'train' or 'test', if the folder of the
            requested `day` does not exist, or if no image is found
            for the requested configuration.

        """
        if ds_type == 'train':
            data_path = self._train_path
        elif ds_type == 'test':
            data_path = self._test_path
        else:
            raise ValueError("use ds_type = {'train', 'test'}.")

        day_path = fm.join(data_path, day)
        if not fm.folder_exist(day_path):
            raise ValueError("{:} not available.".format(day))

        self.logger.info(
            "Loading iCubWorld{:} {:} {:} dataset from {:}".format(
                '7' if icub7 else '28', day, ds_type, day_path))

        icub7 = 3 if icub7 is True else icub7  # Use the 3rd sub-obj by default

        x = None
        y_orig = []
        img_size = None  # PIL (width, height) of the last processed image
        for obj in sorted(fm.listdir(day_path)):  # Objects (cup, sponge, ..)

            obj_path = fm.join(day_path, obj)

            # Sub-objects (cup1, cup2, ...)
            for sub_obj in sorted(fm.listdir(obj_path)):

                # The sub-object index is the last character of its name
                if icub7 and sub_obj[-1] != str(icub7):
                    continue  # Load only the `icub7`th object

                self.logger.debug("Loading images for {:}".format(sub_obj))

                sub_obj_path = fm.join(obj_path, sub_obj)

                for f in sorted(fm.listdir(sub_obj_path)):

                    # The context manager closes the image file even if
                    # an error occurs while processing it. All pixel data
                    # is read before leaving the `with` block.
                    with Image.open(fm.join(sub_obj_path, f)) as img:

                        if resize_shape is not None:
                            img = resize_img(img, resize_shape)
                        if crop_shape is not None:
                            img = crop_img(img, crop_shape)

                        img_size = img.size
                        img = CArray(img.getdata(), dtype='uint8').ravel()

                    x = x.append(img, axis=0) if x is not None else img

                    y_orig.append(sub_obj)  # Label is given by sub-obj name

        if x is None:
            # Nothing matched (e.g. no sub-object with the given index)
            raise ValueError(
                "no images found in {:} (icub7={:}).".format(day_path, icub7))

        # Create the int-based array of labels. Keep original labels in y_orig
        y_orig = CArray(y_orig)
        y = y_orig.unique(return_inverse=True)[1]

        if normalize is True:
            x /= 255.0

        # Size of images is the crop shape (if any) otherwise, the resize
        # shape. If neither is given, use the size of the loaded images.
        if crop_shape is not None:
            img_h, img_w = crop_shape
        elif resize_shape is not None:
            img_h, img_w = resize_shape
        else:
            img_w, img_h = img_size  # PIL size is (width, height)

        header = CDatasetHeader(img_w=img_w, img_h=img_h, y_orig=y_orig)

        return CDataset(x, y, header=header)

    def _get_data(self, file_url, dl_folder):
        """Download input datafile, unzip and store in `dl_folder`.

        The archive is downloaded only if a file with the expected name
        and MD5 digest is not already in `dl_folder`. After extraction
        the archive and the macOS private files are removed and, if
        needed, the content of the extracted top-level folder is moved
        one level up.

        Parameters
        ----------
        file_url : str
            URL of the file to download.
        dl_folder : str
            Path to the folder where to store the downloaded file.

        Raises
        ------
        RuntimeError
            If the 'train' and 'test' folders are not available
            after the extraction.

        """
        f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
        if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
            # Generate the full path to the downloaded file
            f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

        self.logger.info("Extracting files...")

        # Extract the content of downloaded file (archive is closed on exit)
        with zipfile.ZipFile(f_dl, 'r') as zf:
            zf.extractall(dl_folder)
        # Remove downloaded file
        fm.remove_file(f_dl)

        # iCubWorld28 zip file contains a macosx private folder, clean it up
        macosx_dir = fm.join(dl_folder, '__MACOSX')
        if fm.folder_exist(macosx_dir):
            fm.remove_folder(macosx_dir, force=True)

        # iCubWorld28 zip file contains macosx private files, clean them up
        for dirpath, dirnames, filenames in os.walk(dl_folder):
            for file in filenames:
                if fnmatch(file, '.DS_Store'):
                    fm.remove_file(fm.join(dirpath, file))

        # Now move all data to an upper folder if needed
        if not self._data_available():
            # Only folders are candidates; sorting makes the choice
            # deterministic if more than one is found
            sub_dirs = [fm.join(dl_folder, e)
                        for e in sorted(fm.listdir(dl_folder))
                        if fm.folder_exist(fm.join(dl_folder, e))]
            if not sub_dirs:
                raise RuntimeError("dataset main file not available!")
            sub_d = sub_dirs[0]

            for e in fm.listdir(sub_d):
                e_full = fm.join(sub_d, e)  # Full path to current element
                try:  # Call copy_file or copy_folder when applicable
                    if fm.file_exist(e_full) is True:
                        fm.copy_file(e_full, dl_folder)
                    elif fm.folder_exist(e_full) is True:
                        fm.copy_folder(e_full, fm.join(dl_folder, e))
                except Exception as exc:
                    # Best effort: missing data is detected by the check
                    # below, but the failure should not pass silently
                    self.logger.warning(
                        "Unable to copy {:}: {:}".format(e_full, exc))

            # Check that the main dataset file is now in the correct folder
            if not self._data_available():
                raise RuntimeError("dataset main file not available!")

            # The subdirectory can now be removed
            fm.remove_folder(sub_d, force=True)