Unverified commit 08c8f0e0, authored by Philip Meier and committed by GitHub

Merge mock data preparation and dataset logic in prototype tests (#6010)

* merge mock data preparation and loading

* address comments

* fix extra file creation

* remove tmp folder

* inline images meta creation in coco mock data
parent d9a69506
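
In practice, the commit collapses the tests' two-step pattern into a single call. Roughly, as a sketch distilled from the test diff below (`dataset_mock` and `config` come from the test parametrization; this is not new public API surface):

```python
# before: prepare the mock data, then load the dataset separately
mock_info = dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)

# after: one call does both and returns both objects
dataset, mock_info = dataset_mock.load(config)
```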
@@ -10,19 +10,18 @@ import lzma
 import pathlib
 import pickle
 import random
+import shutil
 import unittest.mock
 import warnings
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter

 import numpy as np
 import PIL.Image
 import pytest
 import torch
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
-from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets

 make_tensor = functools.partial(_make_tensor, device="cpu")
@@ -62,27 +61,51 @@ class DatasetMock:

         return mock_info

-    def prepare(self, config):
+    def load(self, config):
         # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
         # test/test_prototype_builtin_datasets.py
         root = pathlib.Path(datasets.home()) / self.name
         root.mkdir(exist_ok=True)

-        mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
-
-        with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
-            required_file_names = {
-                resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
-            }
-        available_file_names = {path.name for path in root.glob("*")}
-        missing_file_names = required_file_names - available_file_names
-        if missing_file_names:
-            raise pytest.UsageError(
-                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
-                f"for {config}, but they were not created by the mock data function."
-            )
-
-        return mock_info
+        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
+        # this will only download **and** preprocess if the file is not present. In other words, if we already place
+        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
+        # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move
+        # it to `root` only when it is requested.
+        tmp_mock_data_folder = root / "__mock__"
+        tmp_mock_data_folder.mkdir(parents=True)
+
+        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
+
+        def patched_download(resource, root, **kwargs):
+            src = tmp_mock_data_folder / resource.file_name
+            if not src.exists():
+                raise pytest.UsageError(
+                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
+                    f"but it was not created by the mock data function."
+                )
+
+            dst = root / resource.file_name
+            shutil.move(str(src), str(root))
+
+            return dst
+
+        with unittest.mock.patch(
+            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
+        ):
+            dataset = datasets.load(self.name, **config)
+
+        extra_files = list(tmp_mock_data_folder.glob("**/*"))
+        if extra_files:
+            raise pytest.UsageError(
+                (
+                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
+                    f"but they were not loaded:\n\n"
+                )
+                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
+            )
+
+        tmp_mock_data_folder.rmdir()
+
+        return dataset, mock_info


 def config_id(name, config):
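
The core idea of the new `load` is a staging pattern: mock files live in a temporary `__mock__` folder and are only moved into `root` when the patched download is actually requested, so the resource's preprocessing still runs. A minimal, self-contained sketch of that pattern (`FakeResource` and `staged_download` are illustrative names, not the torchvision API):

```python
import pathlib
import shutil
import tempfile
import unittest.mock


class FakeResource:  # stand-in for torchvision's OnlineResource
    def __init__(self, file_name):
        self.file_name = file_name

    def download(self, root):
        raise RuntimeError("tests must never hit the network")


def staged_download(resource, root, **kwargs):
    # Move the staged mock file into `root` only when it is requested.
    src = root / "__mock__" / resource.file_name
    dst = root / resource.file_name
    shutil.move(str(src), str(dst))
    return dst


root = pathlib.Path(tempfile.mkdtemp())
staging = root / "__mock__"
staging.mkdir()
(staging / "archive.zip").touch()  # output of the "mock data function"

with unittest.mock.patch.object(FakeResource, "download", new=staged_download):
    path = FakeResource("archive.zip").download(root)

assert path.exists()
assert not (staging / "archive.zip").exists()  # consumed, i.e. actually loaded
```

Anything still sitting in the staging folder afterwards is, by construction, a file the dataset never asked for, which is exactly what the `extra_files` check above reports.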
@@ -513,22 +536,6 @@ def imagenet(root, config):

 class CocoMockData:
-    @classmethod
-    def _make_images_archive(cls, root, name, *, num_samples):
-        image_paths = create_image_folder(
-            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
-        )
-
-        images_meta = []
-        for path in image_paths:
-            with PIL.Image.open(path) as image:
-                width, height = image.size
-            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))
-
-        make_zip(root, f"{name}.zip")
-
-        return images_meta
-
     @classmethod
     def _make_annotations_json(
         cls,

@@ -596,16 +603,38 @@ class CocoMockData:
         cls,
         root,
         *,
+        split,
         year,
         num_samples,
     ):
         annotations_dir = root / "annotations"
         annotations_dir.mkdir()

-        for split in ("train", "val"):
-            config_name = f"{split}{year}"
+        for split_ in ("train", "val"):
+            config_name = f"{split_}{year}"
+
+            images_meta = [
+                dict(
+                    file_name=f"{idx:012d}.jpg",
+                    id=idx,
+                    width=width,
+                    height=height,
+                )
+                for idx, (height, width) in enumerate(
+                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
+                )
+            ]
+
+            if split_ == split:
+                create_image_folder(
+                    root,
+                    config_name,
+                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
+                    num_examples=num_samples,
+                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
+                )
+
+            make_zip(root, f"{config_name}.zip")

-            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
             cls._make_annotations(
                 annotations_dir,
                 config_name,

@@ -625,7 +654,7 @@ class CocoMockData:
         )
     )


 def coco(root, config):
-    return CocoMockData.generate(root, year=config["year"], num_samples=5)
+    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)


 class SBDMockData:
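
Worth noting in the hunk above: the same randomly drawn `(height, width)` pairs drive both the annotation metadata and the generated image files, so the two stay consistent even though images are only materialized for the requested split. Restated in isolation (a sketch; `create_image_folder`'s `size` callback is the test-utils hook used above):

```python
import torch

num_samples = 5
# one (height, width) pair per sample, between 3 and 10 pixels
sizes = torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
images_meta = [
    dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
    for idx, (height, width) in enumerate(sizes)
]
# images are then created only for the requested split, reusing the same sizes:
# create_image_folder(..., size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]))
```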
@@ -799,8 +828,11 @@ class VOCMockData:
     def generate(cls, root, *, year, trainval):
-        archive_folder = root
         if year == "2011":
-            archive_folder /= "TrainVal"
-        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
+            archive_folder = root / "TrainVal"
+            data_folder = archive_folder / "VOCdevkit"
+        else:
+            archive_folder = data_folder = root / "VOCdevkit"
+        data_folder = data_folder / f"VOC{year}"
         data_folder.mkdir(parents=True, exist_ok=True)

         ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)

@@ -810,7 +842,7 @@ class VOCMockData:
             (cls._make_detection_anns_folder, "Annotations", ".xml"),
         ]:
             make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))

-        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
+        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

         return num_samples_map
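
Spelled out, the effect of the two VOC hunks is that the tarball is now created from `archive_folder`, so the archive's internal layout matches the real downloads. A sketch of the paths implied by the new branch (`voc_mock_folders` is a hypothetical helper name; the logic mirrors `VOCMockData.generate` above):

```python
import pathlib


def voc_mock_folders(root: pathlib.Path, year: str):
    # mirrors the branch in VOCMockData.generate
    if year == "2011":
        archive_folder = root / "TrainVal"                      # tar this folder
        data_folder = archive_folder / "VOCdevkit" / "VOC2011"  # fill this folder
    else:
        archive_folder = root / "VOCdevkit"
        data_folder = archive_folder / f"VOC{year}"
    return archive_folder, data_folder
```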
@@ -1091,8 +1123,10 @@ def gtsrb(root, config):
         }
     )

+    archive_folder = root / "GTSRB"
+
     if config["split"] == "train":
-        train_folder = root / "GTSRB" / "Training"
+        train_folder = archive_folder / "Training"
         train_folder.mkdir(parents=True)

         for class_idx in classes:

@@ -1107,9 +1141,9 @@ def gtsrb(root, config):
                 num_examples=num_examples_per_class,
                 class_idx=int(class_idx),
             )

-        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
     else:
-        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder = archive_folder / "Final_Test"
         test_folder.mkdir(parents=True)

         create_image_folder(

@@ -1119,7 +1153,7 @@ def gtsrb(root, config):
             num_examples=num_examples,
         )

-        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

         _make_ann_file(
             path=root / "GT-final_test.csv",
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
     num_samples = {"train": 5, "test": 7}[split]
     num_categories = 3

+    devkit = root / "devkit"
+    devkit.mkdir(parents=True)
+
     if split == "train":
         images_folder_name = "cars_train"
-        devkit = root / "devkit"
-        devkit.mkdir()
-
         annotations_mat_path = devkit / "cars_train_annos.mat"
     else:
         images_folder_name = "cars_test"
test/test_prototype_builtin_datasets.py:

@@ -56,18 +56,14 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_smoke(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not isinstance(dataset, datasets.utils.Dataset):
             raise AssertionError(f"Loading the dataset should return an Dataset, but got {type(dataset)} instead.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_sample(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         try:
             sample = next(iter(dataset))
@@ -84,17 +80,13 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
-        mock_info = dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, mock_info = dataset_mock.load(config)

         assert len(list(dataset)) == mock_info["num_samples"]

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
         if vanilla_tensors:

@@ -105,24 +97,20 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         next(iter(dataset.map(transforms.Identity())))

     @pytest.mark.parametrize("only_datapipe", [False, True])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_traversable(self, dataset_mock, config, only_datapipe):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         traverse(dataset, only_datapipe=only_datapipe)

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         pickle.dumps(dataset)

@@ -135,8 +123,7 @@ class TestCommon:
     @pytest.mark.parametrize("num_workers", [0, 1])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_data_loader(self, dataset_mock, config, num_workers):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         dl = DataLoader(
             dataset,

@@ -153,17 +140,15 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
             raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_save_load(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
         sample = next(iter(dataset))

         with io.BytesIO() as buffer:

@@ -173,8 +158,7 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_infinite_buffer_size(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for dp in extract_datapipes(dataset):
             if hasattr(dp, "buffer_size"):

@@ -184,8 +168,7 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_has_length(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         assert len(dataset) > 0

@@ -193,9 +176,7 @@ class TestCommon:
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
 class TestQMNIST:
     def test_extra_label(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         sample = next(iter(dataset))
         for key, type in (

@@ -218,9 +199,7 @@ class TestGTSRB:
         if config["split"] != "train":
             return

-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             label_from_path = int(Path(sample["path"]).parent.name)

@@ -230,9 +209,7 @@ class TestGTSRB:
 @parametrize_dataset_mocks(DATASET_MOCKS["usps"])
 class TestUSPS:
     def test_sample_content(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             assert "image" in sample