Unverified Commit a8f2dedb authored by Zhiqiang Wang's avatar Zhiqiang Wang Committed by GitHub
Browse files

Add Flowers102 dataset (#5177)



* Add Flowers102 datasets

* Fix initialization of images and labels

* Fix _check_exists in Flowers102

* Add Flowers102 to datasets and docs

* Add Flowers102TestCase to unittest

* Fixing Python type statically

* Shuffle the fake labels

* Update test/test_datasets.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Apply the suggestions by pmeier

* Use check_integrity to check file existence

* Save the labels to base_folder

* Minor fixes

* Using a loop makes this more concise without reducing readability
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Using a loop makes this more concise without reducing readability
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Remove self.labels and self.label_to_index attributes

* minor simplification

* Check the existence of image folder

* Revert the check

* Check the existence of image folder

* valid -> val

* keep some stuff private

* minor doc arrangements

* remove default FEATURE_TYPES

* Simplify the datasets existence

* check if the image folder exists
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* isdir -> is_dir
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>
Co-authored-by: default avatarNicolas Hug <nicolashug@fb.com>
parent 1c630960
......@@ -46,6 +46,7 @@ You can also create your own datasets using the provided :ref:`base classes <bas
FER2013
Flickr8k
Flickr30k
Flowers102
FlyingChairs
FlyingThings3D
Food101
......
......@@ -2490,5 +2490,41 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase):
return num_examples * len(classes)
class Flowers102TestCase(datasets_utils.ImageDatasetTestCase):
    """Smoke test for ``datasets.Flowers102`` built on fake on-disk data."""

    DATASET_CLASS = datasets.Flowers102
    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
    REQUIRED_PACKAGES = ("scipy",)

    def inject_fake_data(self, tmpdir: str, config):
        # Mirror the real on-disk layout: <root>/flowers-102/{jpg/, imagelabels.mat, setid.mat}
        root = pathlib.Path(tmpdir) / "flowers-102"
        n_classes = 3
        split_sizes = {"train": 5, "val": 4, "test": 3}
        total = sum(split_sizes.values())

        # Fake images named exactly like the real archive: image_00001.jpg, ...
        datasets_utils.create_image_folder(
            root,
            "jpg",
            file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg",
            num_examples=total,
        )

        # Labels are stored 1-based in a (1, N) array, as in the real imagelabels.mat.
        datasets_utils.lazy_importer.scipy.io.savemat(
            str(root / "imagelabels.mat"),
            {"labels": np.random.randint(1, n_classes + 1, size=(1, total), dtype=np.uint8)},
        )

        # Shuffle 1-based image ids and carve them into the three disjoint splits.
        image_ids = np.arange(1, total + 1, dtype=np.uint16)
        np.random.shuffle(image_ids)
        splits = {
            "trnid": image_ids[: split_sizes["train"]].reshape(1, -1),
            "valid": image_ids[split_sizes["train"] : -split_sizes["test"]].reshape(1, -1),
            "tstid": image_ids[-split_sizes["test"] :].reshape(1, -1),
        }
        datasets_utils.lazy_importer.scipy.io.savemat(str(root / "setid.mat"), splits)

        return split_sizes[config["split"]]
# Allow running this test module directly (outside the pytest runner).
if __name__ == "__main__":
    unittest.main()
......@@ -10,6 +10,7 @@ from .dtd import DTD
from .fakedata import FakeData
from .fer2013 import FER2013
from .flickr import Flickr8k, Flickr30k
from .flowers102 import Flowers102
from .folder import ImageFolder, DatasetFolder
from .food101 import Food101
from .gtsrb import GTSRB
......@@ -61,6 +62,7 @@ __all__ = (
"SBU",
"Flickr8k",
"Flickr30k",
"Flowers102",
"VOCSegmentation",
"VOCDetection",
"Cityscapes",
......
from pathlib import Path
from typing import Any, Tuple, Callable, Optional
import PIL.Image
from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg
from .vision import VisionDataset
class Flowers102(VisionDataset):
    """`Oxford 102 Flower <https://www.robots.ox.ac.uk/~vgg/data/flowers/102/>`_ Dataset.

    .. warning::

        This class needs `scipy <https://docs.scipy.org/doc/>`_ to load target files from `.mat` format.

    Oxford 102 Flower is an image classification dataset consisting of 102 flower categories. The
    flowers were chosen to be flowers commonly occurring in the United Kingdom. Each class consists of
    between 40 and 258 images.

    The images have large scale, pose and light variations. In addition, there are categories that
    have large variations within the category, and several very similar categories.

    Args:
        root (string): Root directory of the dataset.
        split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in an PIL image and returns a
            transformed version. E.g, ``transforms.RandomCrop``.
        target_transform (callable, optional): A function/transform that takes in the target and transforms it.
    """

    _download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/"
    # key -> (filename, md5)
    _file_dict = {
        "image": ("102flowers.tgz", "52808999861908f626f3c1f4e79d11fa"),
        "label": ("imagelabels.mat", "e0620be6f572b9609742df49c70aed4d"),
        "setid": ("setid.mat", "a5357ecc9cb78c4bef273ce3793fc85c"),
    }
    # Maps our split names to the variable names used inside setid.mat.
    _splits_map = {"train": "trnid", "val": "valid", "test": "tstid"}

    def __init__(
        self,
        root: str,
        split: str = "train",
        download: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transform=transform, target_transform=target_transform)
        self._split = verify_str_arg(split, "split", ("train", "val", "test"))
        self._base_folder = Path(self.root) / "flowers-102"
        self._images_folder = self._base_folder / "jpg"

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError("Dataset not found or corrupted. You can use download=True to download it")

        # Deferred import so the hard scipy dependency only bites when the class is used.
        from scipy.io import loadmat

        set_ids = loadmat(self._base_folder / self._file_dict["setid"][0], squeeze_me=True)
        image_ids = set_ids[self._splits_map[self._split]].tolist()

        labels = loadmat(self._base_folder / self._file_dict["label"][0], squeeze_me=True)
        # imagelabels.mat stores one label per image, ordered by image id starting at 1,
        # so enumerate from 1 to build the id -> label mapping.
        # NOTE(review): labels are kept as the 1-based values stored in the .mat file;
        # most classification datasets in this package use 0-based targets — confirm.
        image_id_to_label = dict(enumerate(labels["labels"].tolist(), 1))

        self._labels = []
        self._image_files = []
        for image_id in image_ids:
            self._labels.append(image_id_to_label[image_id])
            self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg")

    def __len__(self) -> int:
        return len(self._image_files)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """Return the ``(image, target)`` pair at ``idx`` with transforms applied."""
        image_file, label = self._image_files[idx], self._labels[idx]
        image = PIL.Image.open(image_file).convert("RGB")

        if self.transform:
            image = self.transform(image)

        if self.target_transform:
            label = self.target_transform(label)

        return image, label

    def extra_repr(self) -> str:
        return f"split={self._split}"

    def _check_integrity(self) -> bool:
        """Check that the image folder and both .mat files are present (md5-verified)."""
        # Path.is_dir() is False for a non-existent path, so no separate exists() check is needed.
        if not self._images_folder.is_dir():
            return False

        for key in ("label", "setid"):
            filename, md5 = self._file_dict[key]
            if not check_integrity(str(self._base_folder / filename), md5):
                return False
        return True

    def download(self) -> None:
        """Download and extract the archive and metadata files, skipping if already present."""
        if self._check_integrity():
            return
        download_and_extract_archive(
            f"{self._download_url_prefix}{self._file_dict['image'][0]}",
            str(self._base_folder),
            md5=self._file_dict["image"][1],
        )
        for key in ("label", "setid"):
            filename, md5 = self._file_dict[key]
            download_url(self._download_url_prefix + filename, str(self._base_folder), md5=md5)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment