Unverified Commit a8f2dedb authored by Zhiqiang Wang's avatar Zhiqiang Wang Committed by GitHub
Browse files

Add Flowers102 dataset (#5177)



* Add Flowers102 datasets

* Fix initialization of images and labels

* Fix _check_exists in Flowers102

* Add Flowers102 to datasets and docs

* Add Flowers102TestCase to unittest

* Fixing Python type statically

* Shuffle the fake labels

* Update test/test_datasets.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Apply the suggestions by pmeier

* Use check_integrity to check file existence

* Save the labels to base_folder

* Minor fixes

* Using a loop makes this more concise without reducing readability
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Using a loop makes this more concise without reducing readability
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Remove self.labels and self.label_to_index attributes

* minor simplification

* Check the existence of image folder

* Revert the check

* Check the existence of image folder

* valid -> val

* keep some stuff private

* minor doc arrangements

* remove default FEATURE_TYPES

* Simplify the datasets existence

* check if the image folder exists
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* isdir -> is_dir
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>
Co-authored-by: default avatarNicolas Hug <nicolashug@fb.com>
parent 1c630960
......@@ -46,6 +46,7 @@ You can also create your own datasets using the provided :ref:`base classes <bas
FER2013
Flickr8k
Flickr30k
Flowers102
FlyingChairs
FlyingThings3D
Food101
......
......@@ -2490,5 +2490,41 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase):
return num_examples * len(classes)
class Flowers102TestCase(datasets_utils.ImageDatasetTestCase):
    """Smoke test for ``datasets.Flowers102`` built on fake on-disk data."""

    DATASET_CLASS = datasets.Flowers102
    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
    REQUIRED_PACKAGES = ("scipy",)

    def inject_fake_data(self, tmpdir: str, config):
        # Mirror the real on-disk layout: <root>/flowers-102/{jpg/, imagelabels.mat, setid.mat}
        root = pathlib.Path(tmpdir) / "flowers-102"
        n_classes = 3
        split_sizes = {"train": 5, "val": 4, "test": 3}
        total = sum(split_sizes.values())

        # Fake images named exactly like the real archive: image_00001.jpg, ...
        datasets_utils.create_image_folder(
            root,
            "jpg",
            file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg",
            num_examples=total,
        )

        # Labels are stored 1-based in a (1, N) array, as in the real imagelabels.mat.
        datasets_utils.lazy_importer.scipy.io.savemat(
            str(root / "imagelabels.mat"),
            {"labels": np.random.randint(1, n_classes + 1, size=(1, total), dtype=np.uint8)},
        )

        # Shuffle 1-based image ids and carve them into the three disjoint splits.
        image_ids = np.arange(1, total + 1, dtype=np.uint16)
        np.random.shuffle(image_ids)
        splits = {
            "trnid": image_ids[: split_sizes["train"]].reshape(1, -1),
            "valid": image_ids[split_sizes["train"] : -split_sizes["test"]].reshape(1, -1),
            "tstid": image_ids[-split_sizes["test"] :].reshape(1, -1),
        }
        datasets_utils.lazy_importer.scipy.io.savemat(str(root / "setid.mat"), splits)

        return split_sizes[config["split"]]
# Allow running this test module directly (outside the pytest runner).
if __name__ == "__main__":
    unittest.main()
......@@ -10,6 +10,7 @@ from .dtd import DTD
from .fakedata import FakeData
from .fer2013 import FER2013
from .flickr import Flickr8k, Flickr30k
from .flowers102 import Flowers102
from .folder import ImageFolder, DatasetFolder
from .food101 import Food101
from .gtsrb import GTSRB
......@@ -61,6 +62,7 @@ __all__ = (
"SBU",
"Flickr8k",
"Flickr30k",
"Flowers102",
"VOCSegmentation",
"VOCDetection",
"Cityscapes",
......
from pathlib import Path
from typing import Any, Tuple, Callable, Optional
import PIL.Image
from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg
from .vision import VisionDataset
class Flowers102(VisionDataset):
    """`Oxford 102 Flower <https://www.robots.ox.ac.uk/~vgg/data/flowers/102/>`_ Dataset.

    .. warning::

        This class needs `scipy <https://docs.scipy.org/doc/>`_ to load target files from `.mat` format.

    Oxford 102 Flower is an image classification dataset consisting of 102 flower categories. The
    flowers were chosen to be flowers commonly occurring in the United Kingdom. Each class consists of
    between 40 and 258 images.

    The images have large scale, pose and light variations. In addition, there are categories that
    have large variations within the category, and several very similar categories.

    Args:
        root (string): Root directory of the dataset.
        split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in an PIL image and returns a
            transformed version. E.g, ``transforms.RandomCrop``.
        target_transform (callable, optional): A function/transform that takes in the target and transforms it.
    """

    _download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/"
    # key -> (filename, md5)
    _file_dict = {
        "image": ("102flowers.tgz", "52808999861908f626f3c1f4e79d11fa"),
        "label": ("imagelabels.mat", "e0620be6f572b9609742df49c70aed4d"),
        "setid": ("setid.mat", "a5357ecc9cb78c4bef273ce3793fc85c"),
    }
    # Maps our split names to the variable names used inside setid.mat.
    _splits_map = {"train": "trnid", "val": "valid", "test": "tstid"}

    def __init__(
        self,
        root: str,
        split: str = "train",
        download: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transform=transform, target_transform=target_transform)
        self._split = verify_str_arg(split, "split", ("train", "val", "test"))
        self._base_folder = Path(self.root) / "flowers-102"
        self._images_folder = self._base_folder / "jpg"

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError("Dataset not found or corrupted. You can use download=True to download it")

        # Deferred import so the hard scipy dependency only bites when the class is used.
        from scipy.io import loadmat

        set_ids = loadmat(self._base_folder / self._file_dict["setid"][0], squeeze_me=True)
        image_ids = set_ids[self._splits_map[self._split]].tolist()

        labels = loadmat(self._base_folder / self._file_dict["label"][0], squeeze_me=True)
        # imagelabels.mat stores one label per image, ordered by image id starting at 1,
        # so enumerate from 1 to build the id -> label mapping.
        # NOTE(review): labels are kept as the 1-based values stored in the .mat file;
        # most classification datasets in this package use 0-based targets — confirm.
        image_id_to_label = dict(enumerate(labels["labels"].tolist(), 1))

        self._labels = []
        self._image_files = []
        for image_id in image_ids:
            self._labels.append(image_id_to_label[image_id])
            self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg")

    def __len__(self) -> int:
        return len(self._image_files)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """Return the ``(image, target)`` pair at ``idx`` with transforms applied."""
        image_file, label = self._image_files[idx], self._labels[idx]
        image = PIL.Image.open(image_file).convert("RGB")

        if self.transform:
            image = self.transform(image)

        if self.target_transform:
            label = self.target_transform(label)

        return image, label

    def extra_repr(self) -> str:
        return f"split={self._split}"

    def _check_integrity(self) -> bool:
        """Check that the image folder and both .mat files are present (md5-verified)."""
        # Path.is_dir() is False for a non-existent path, so no separate exists() check is needed.
        if not self._images_folder.is_dir():
            return False

        for key in ("label", "setid"):
            filename, md5 = self._file_dict[key]
            if not check_integrity(str(self._base_folder / filename), md5):
                return False
        return True

    def download(self) -> None:
        """Download and extract the archive and metadata files, skipping if already present."""
        if self._check_integrity():
            return
        download_and_extract_archive(
            f"{self._download_url_prefix}{self._file_dict['image'][0]}",
            str(self._base_folder),
            md5=self._file_dict["image"][1],
        )
        for key in ("label", "setid"):
            filename, md5 = self._file_dict[key]
            download_url(self._download_url_prefix + filename, str(self._base_folder), md5=md5)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment