Unverified Commit 08c8f0e0 authored by Philip Meier, committed by GitHub

Merge mock data preparation and dataset logic in prototype tests (#6010)

* merge mock data preparation and loading

* address comments

* fix extra file creation

* remove tmp folder

* inline images meta creation in coco mock data
parent d9a69506
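In short: the old two-step test setup (`prepare()` the mock data, then `datasets.load(...)`) is merged into a single `DatasetMock.load(config)` that returns both the loaded dataset and the mock info. A minimal sketch of the resulting calling pattern, using a hypothetical stand-in class rather than the real test helper:

```python
# Hypothetical stand-in for DatasetMock, only to illustrate the new call shape.
class FakeDatasetMock:
    def load(self, config):
        mock_info = {"num_samples": 3}  # metadata recorded while generating mock data
        dataset = [{"label": i, "split": config["split"]} for i in range(mock_info["num_samples"])]
        return dataset, mock_info

dataset, mock_info = FakeDatasetMock().load(config={"split": "train"})
assert len(list(dataset)) == mock_info["num_samples"]
```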
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -10,19 +10,18 @@ import lzma
 import pathlib
 import pickle
 import random
+import shutil
 import unittest.mock
 import warnings
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter

 import numpy as np
-import PIL.Image
 import pytest
 import torch
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
-from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets

 make_tensor = functools.partial(_make_tensor, device="cpu")
@@ -62,27 +61,51 @@ class DatasetMock:
         return mock_info

-    def prepare(self, config):
+    def load(self, config):
         # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
         # test/test_prototype_builtin_datasets.py
         root = pathlib.Path(datasets.home()) / self.name
-        root.mkdir(exist_ok=True)
+        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
+        # this will only download **and** preprocess if the file is not present. In other words, if we already place
+        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
+        # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move
+        # it to `root` only when it is requested.
+        tmp_mock_data_folder = root / "__mock__"
+        tmp_mock_data_folder.mkdir(parents=True)

-        mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
+        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))

-        with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
-            required_file_names = {
-                resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
-            }
-        available_file_names = {path.name for path in root.glob("*")}
-        missing_file_names = required_file_names - available_file_names
-        if missing_file_names:
-            raise pytest.UsageError(
-                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
-                f"for {config}, but they were not created by the mock data function."
-            )
+        def patched_download(resource, root, **kwargs):
+            src = tmp_mock_data_folder / resource.file_name
+            if not src.exists():
+                raise pytest.UsageError(
+                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
+                    f"but it was not created by the mock data function."
+                )

-        return mock_info
+            dst = root / resource.file_name
+            shutil.move(str(src), str(root))
+
+            return dst
+
+        with unittest.mock.patch(
+            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
+        ):
+            dataset = datasets.load(self.name, **config)
+
+        extra_files = list(tmp_mock_data_folder.glob("**/*"))
+        if extra_files:
+            raise pytest.UsageError(
+                (
+                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
+                    f"but they were not loaded:\n\n"
+                )
+                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
+            )
+
+        tmp_mock_data_folder.rmdir()
+
+        return dataset, mock_info


 def config_id(name, config):
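The patch above replaces the unbound `OnlineResource.download` method on the class, so the plain `patched_download` function receives the resource instance as its first argument. A self-contained sketch of that `unittest.mock` mechanism with a toy class (names here are illustrative, not torchvision's):

```python
import unittest.mock

class Resource:
    file_name = "data.tar"

    def download(self, root):
        raise RuntimeError("tests must never hit the network")

# Patching with `new=` swaps the function in at class level, so attribute
# lookup binds it like a method and the instance arrives as the first
# positional argument, just as `resource` does in `patched_download` above.
def fake_download(resource, root, **kwargs):
    return f"{root}/{resource.file_name}"

with unittest.mock.patch.object(Resource, "download", new=fake_download):
    assert Resource().download("/tmp") == "/tmp/data.tar"
```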
@@ -513,22 +536,6 @@ def imagenet(root, config):


 class CocoMockData:
-    @classmethod
-    def _make_images_archive(cls, root, name, *, num_samples):
-        image_paths = create_image_folder(
-            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
-        )
-
-        images_meta = []
-        for path in image_paths:
-            with PIL.Image.open(path) as image:
-                width, height = image.size
-            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))
-
-        make_zip(root, f"{name}.zip")
-
-        return images_meta
-
     @classmethod
     def _make_annotations_json(
         cls,
@@ -596,16 +603,38 @@ class CocoMockData:
         cls,
         root,
         *,
+        split,
         year,
         num_samples,
     ):
         annotations_dir = root / "annotations"
         annotations_dir.mkdir()

-        for split in ("train", "val"):
-            config_name = f"{split}{year}"
-            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
+        for split_ in ("train", "val"):
+            config_name = f"{split_}{year}"
+
+            images_meta = [
+                dict(
+                    file_name=f"{idx:012d}.jpg",
+                    id=idx,
+                    width=width,
+                    height=height,
+                )
+                for idx, (height, width) in enumerate(
+                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
+                )
+            ]
+
+            if split_ == split:
+                create_image_folder(
+                    root,
+                    config_name,
+                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
+                    num_examples=num_samples,
+                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
+                )
+                make_zip(root, f"{config_name}.zip")

             cls._make_annotations(
                 annotations_dir,
                 config_name,
@@ -625,7 +654,7 @@ class CocoMockData:
             )
         )


 def coco(root, config):
-    return CocoMockData.generate(root, year=config["year"], num_samples=5)
+    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)


 class SBDMockData:
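With `_make_images_archive` gone, image sizes are now drawn up front with `torch.randint` and reused twice: to write the image files for the requested split and to fill the annotations, so the files on disk and the annotation entries for the loaded split always agree. A standalone sketch of that metadata generation (the bounds and `num_samples` follow the diff; the printed values are random):

```python
import torch

num_samples = 5
# Heights and widths between 3 and 10 pixels, matching torch.randint(3, 11, ...).
images_meta = [
    dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
    for idx, (height, width) in enumerate(
        torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
    )
]
print(images_meta[0])  # e.g. {'file_name': '000000000000.jpg', 'id': 0, 'width': 7, 'height': 4}
```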
@@ -799,8 +828,11 @@ class VOCMockData:
     def generate(cls, root, *, year, trainval):
         archive_folder = root
         if year == "2011":
-            archive_folder /= "TrainVal"
-        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
+            archive_folder = root / "TrainVal"
+            data_folder = archive_folder / "VOCdevkit"
+        else:
+            archive_folder = data_folder = root / "VOCdevkit"
+        data_folder = data_folder / f"VOC{year}"
         data_folder.mkdir(parents=True, exist_ok=True)

         ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
@@ -810,7 +842,7 @@ class VOCMockData:
             (cls._make_detection_anns_folder, "Annotations", ".xml"),
         ]:
             make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
-        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
+        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

         return num_samples_map
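The `make_tar` fix matters because the 2011 archive nests everything under a leading `TrainVal/` directory (which is why the diff roots `archive_folder` there); archiving `data_folder` would drop that prefix. A toy illustration with the standard library, not the repo's `make_tar` helper:

```python
import pathlib
import tarfile
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
(root / "TrainVal" / "VOCdevkit" / "VOC2011").mkdir(parents=True)

# Archiving the *archive folder* keeps the `TrainVal/` prefix inside the tar,
# mirroring the layout the dataset expects for 2011.
with tarfile.open(root / "voc.tar", "w") as fh:
    fh.add(root / "TrainVal", arcname="TrainVal")

with tarfile.open(root / "voc.tar") as fh:
    print(fh.getnames())  # ['TrainVal', 'TrainVal/VOCdevkit', 'TrainVal/VOCdevkit/VOC2011']
```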
@@ -1091,8 +1123,10 @@ def gtsrb(root, config):
         }
     )

+    archive_folder = root / "GTSRB"
+
     if config["split"] == "train":
-        train_folder = root / "GTSRB" / "Training"
+        train_folder = archive_folder / "Training"
         train_folder.mkdir(parents=True)

         for class_idx in classes:
@@ -1107,9 +1141,9 @@ def gtsrb(root, config):
                 num_examples=num_examples_per_class,
                 class_idx=int(class_idx),
             )
-        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
     else:
-        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder = archive_folder / "Final_Test"
         test_folder.mkdir(parents=True)

         create_image_folder(
@@ -1119,7 +1153,7 @@ def gtsrb(root, config):
             num_examples=num_examples,
         )
-        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

         _make_ann_file(
             path=root / "GT-final_test.csv",
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
     num_samples = {"train": 5, "test": 7}[split]
     num_categories = 3

+    devkit = root / "devkit"
+    devkit.mkdir(parents=True)
     if split == "train":
         images_folder_name = "cars_train"
-        devkit = root / "devkit"
-        devkit.mkdir()
-
         annotations_mat_path = devkit / "cars_train_annos.mat"
     else:
         images_folder_name = "cars_test"
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -56,18 +56,14 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_smoke(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not isinstance(dataset, datasets.utils.Dataset):
             raise AssertionError(f"Loading the dataset should return a Dataset, but got {type(dataset)} instead.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_sample(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         try:
             sample = next(iter(dataset))
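The comments in the mock file above refer to an autouse `test_home` fixture in this file that points `datasets.home()` at a per-test temporary directory. A hedged sketch of what such a fixture can look like, written here with pytest's built-in `monkeypatch` rather than whatever mocking helper the repo actually uses:

```python
import pytest
from torchvision.prototype import datasets

@pytest.fixture(autouse=True)
def test_home(monkeypatch, tmp_path):
    # Every test sees an isolated dataset root, so mock data never leaks
    # between tests or touches the user's real download directory.
    monkeypatch.setattr(datasets, "home", lambda *args, **kwargs: str(tmp_path))
    yield tmp_path
```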
@@ -84,17 +80,13 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
-        mock_info = dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, mock_info = dataset_mock.load(config)

         assert len(list(dataset)) == mock_info["num_samples"]

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
         if vanilla_tensors:
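`test_no_vanilla_tensors` relies on an exact `type(value) is torch.Tensor` check rather than `isinstance`, so tensor *subclasses* (the prototype feature types) pass while plain tensors are flagged. A self-contained illustration with a hypothetical subclass:

```python
import torch

class FakeFeature(torch.Tensor):  # stand-in for a prototype feature type
    pass

sample = {
    "image": torch.rand(3, 4, 4).as_subclass(FakeFeature),  # subclass: allowed
    "raw": torch.rand(3),                                   # vanilla tensor: flagged
}
vanilla_tensors = {key for key, value in sample.items() if type(value) is torch.Tensor}
print(vanilla_tensors)  # {'raw'}
```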
@@ -105,24 +97,20 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         next(iter(dataset.map(transforms.Identity())))

     @pytest.mark.parametrize("only_datapipe", [False, True])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_traversable(self, dataset_mock, config, only_datapipe):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         traverse(dataset, only_datapipe=only_datapipe)

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         pickle.dumps(dataset)
@@ -135,8 +123,7 @@ class TestCommon:
     @pytest.mark.parametrize("num_workers", [0, 1])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_data_loader(self, dataset_mock, config, num_workers):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         dl = DataLoader(
             dataset,
@@ -153,17 +140,15 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
             raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_save_load(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         sample = next(iter(dataset))

         with io.BytesIO() as buffer:
@@ -173,8 +158,7 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_infinite_buffer_size(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for dp in extract_datapipes(dataset):
             if hasattr(dp, "buffer_size"):
@@ -184,8 +168,7 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_has_length(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         assert len(dataset) > 0

@@ -193,9 +176,7 @@ class TestCommon:
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
 class TestQMNIST:
     def test_extra_label(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         sample = next(iter(dataset))
         for key, type in (
@@ -218,9 +199,7 @@ class TestGTSRB:
         if config["split"] != "train":
             return

-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             label_from_path = int(Path(sample["path"]).parent.name)
@@ -230,9 +209,7 @@ class TestGTSRB:
 @parametrize_dataset_mocks(DATASET_MOCKS["usps"])
 class TestUSPS:
     def test_sample_content(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             assert "image" in sample