Unverified commit 08c8f0e0, authored by Philip Meier and committed by GitHub

Merge mock data preparation and dataset logic in prototype tests (#6010)

* merge mock data preparation and loading

* address comments

* fix extra file creation

* remove tmp folder

* inline images meta creation in coco mock data
parent d9a69506
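
In practice, the commit collapses the tests' two-step pattern into a single call. Roughly, as a sketch distilled from the test diff below (`dataset_mock` and `config` come from the test parametrization; this is not new public API surface):

```python
# before: prepare the mock data, then load the dataset separately
mock_info = dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)

# after: one call does both and returns both objects
dataset, mock_info = dataset_mock.load(config)
```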
@@ -10,19 +10,18 @@ import lzma
 import pathlib
 import pickle
 import random
+import shutil
 import unittest.mock
 import warnings
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter

 import numpy as np
 import PIL.Image
 import pytest
 import torch
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
-from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets

 make_tensor = functools.partial(_make_tensor, device="cpu")
@@ -62,27 +61,51 @@ class DatasetMock:

         return mock_info

-    def prepare(self, config):
+    def load(self, config):
         # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
         # test/test_prototype_builtin_datasets.py
         root = pathlib.Path(datasets.home()) / self.name
         root.mkdir(exist_ok=True)

-        mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
-
-        with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
-            required_file_names = {
-                resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
-            }
-        available_file_names = {path.name for path in root.glob("*")}
-        missing_file_names = required_file_names - available_file_names
-        if missing_file_names:
-            raise pytest.UsageError(
-                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
-                f"for {config}, but they were not created by the mock data function."
-            )
-
-        return mock_info
+        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
+        # this will only download **and** preprocess if the file is not present. In other words, if we already place
+        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
+        # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move
+        # it to `root` only when it is requested.
+        tmp_mock_data_folder = root / "__mock__"
+        tmp_mock_data_folder.mkdir(parents=True)
+
+        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
+
+        def patched_download(resource, root, **kwargs):
+            src = tmp_mock_data_folder / resource.file_name
+            if not src.exists():
+                raise pytest.UsageError(
+                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
+                    f"but it was not created by the mock data function."
+                )
+
+            dst = root / resource.file_name
+            shutil.move(str(src), str(root))
+
+            return dst
+
+        with unittest.mock.patch(
+            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
+        ):
+            dataset = datasets.load(self.name, **config)
+
+        extra_files = list(tmp_mock_data_folder.glob("**/*"))
+        if extra_files:
+            raise pytest.UsageError(
+                (
+                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
+                    f"but they were not loaded:\n\n"
+                )
+                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
+            )
+
+        tmp_mock_data_folder.rmdir()
+
+        return dataset, mock_info


 def config_id(name, config):
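
The core idea of the new `load` is a staging pattern: mock files live in a temporary `__mock__` folder and are only moved into `root` when the patched download is actually requested, so the resource's preprocessing still runs. A minimal, self-contained sketch of that pattern (`FakeResource` and `staged_download` are illustrative names, not the torchvision API):

```python
import pathlib
import shutil
import tempfile
import unittest.mock


class FakeResource:  # stand-in for torchvision's OnlineResource
    def __init__(self, file_name):
        self.file_name = file_name

    def download(self, root):
        raise RuntimeError("tests must never hit the network")


def staged_download(resource, root, **kwargs):
    # Move the staged mock file into `root` only when it is requested.
    src = root / "__mock__" / resource.file_name
    dst = root / resource.file_name
    shutil.move(str(src), str(dst))
    return dst


root = pathlib.Path(tempfile.mkdtemp())
staging = root / "__mock__"
staging.mkdir()
(staging / "archive.zip").touch()  # output of the "mock data function"

with unittest.mock.patch.object(FakeResource, "download", new=staged_download):
    path = FakeResource("archive.zip").download(root)

assert path.exists()
assert not (staging / "archive.zip").exists()  # consumed, i.e. actually loaded
```

Anything still sitting in the staging folder afterwards is, by construction, a file the dataset never asked for, which is exactly what the `extra_files` check above reports.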
@@ -513,22 +536,6 @@ def imagenet(root, config):

 class CocoMockData:
-    @classmethod
-    def _make_images_archive(cls, root, name, *, num_samples):
-        image_paths = create_image_folder(
-            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
-        )
-
-        images_meta = []
-        for path in image_paths:
-            with PIL.Image.open(path) as image:
-                width, height = image.size
-            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))
-
-        make_zip(root, f"{name}.zip")
-
-        return images_meta
-
     @classmethod
     def _make_annotations_json(
         cls,

@@ -596,16 +603,38 @@ class CocoMockData:
         cls,
         root,
         *,
+        split,
         year,
         num_samples,
     ):
         annotations_dir = root / "annotations"
         annotations_dir.mkdir()

-        for split in ("train", "val"):
-            config_name = f"{split}{year}"
+        for split_ in ("train", "val"):
+            config_name = f"{split_}{year}"
+
+            images_meta = [
+                dict(
+                    file_name=f"{idx:012d}.jpg",
+                    id=idx,
+                    width=width,
+                    height=height,
+                )
+                for idx, (height, width) in enumerate(
+                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
+                )
+            ]
+
+            if split_ == split:
+                create_image_folder(
+                    root,
+                    config_name,
+                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
+                    num_examples=num_samples,
+                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
+                )
+
+            make_zip(root, f"{config_name}.zip")

-            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
             cls._make_annotations(
                 annotations_dir,
                 config_name,

@@ -625,7 +654,7 @@ class CocoMockData:
         )
     )


 def coco(root, config):
-    return CocoMockData.generate(root, year=config["year"], num_samples=5)
+    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)


 class SBDMockData:
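
Worth noting in the hunk above: the same randomly drawn `(height, width)` pairs drive both the annotation metadata and the generated image files, so the two stay consistent even though images are only materialized for the requested split. Restated in isolation (a sketch; `create_image_folder`'s `size` callback is the test-utils hook used above):

```python
import torch

num_samples = 5
# one (height, width) pair per sample, between 3 and 10 pixels
sizes = torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
images_meta = [
    dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
    for idx, (height, width) in enumerate(sizes)
]
# images are then created only for the requested split, reusing the same sizes:
# create_image_folder(..., size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]))
```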
@@ -799,8 +828,11 @@ class VOCMockData:
     def generate(cls, root, *, year, trainval):
-        archive_folder = root
         if year == "2011":
-            archive_folder /= "TrainVal"
-        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
+            archive_folder = root / "TrainVal"
+            data_folder = archive_folder / "VOCdevkit"
+        else:
+            archive_folder = data_folder = root / "VOCdevkit"
+        data_folder = data_folder / f"VOC{year}"
         data_folder.mkdir(parents=True, exist_ok=True)

         ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)

@@ -810,7 +842,7 @@ class VOCMockData:
             (cls._make_detection_anns_folder, "Annotations", ".xml"),
         ]:
             make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))

-        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
+        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

         return num_samples_map
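
Spelled out, the effect of the two VOC hunks is that the tarball is now created from `archive_folder`, so the archive's internal layout matches the real downloads. A sketch of the paths implied by the new branch (`voc_mock_folders` is a hypothetical helper name; the logic mirrors `VOCMockData.generate` above):

```python
import pathlib


def voc_mock_folders(root: pathlib.Path, year: str):
    # mirrors the branch in VOCMockData.generate
    if year == "2011":
        archive_folder = root / "TrainVal"                      # tar this folder
        data_folder = archive_folder / "VOCdevkit" / "VOC2011"  # fill this folder
    else:
        archive_folder = root / "VOCdevkit"
        data_folder = archive_folder / f"VOC{year}"
    return archive_folder, data_folder
```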
@@ -1091,8 +1123,10 @@ def gtsrb(root, config):
         }
     )

+    archive_folder = root / "GTSRB"
+
     if config["split"] == "train":
-        train_folder = root / "GTSRB" / "Training"
+        train_folder = archive_folder / "Training"
         train_folder.mkdir(parents=True)

         for class_idx in classes:

@@ -1107,9 +1141,9 @@ def gtsrb(root, config):
                 num_examples=num_examples_per_class,
                 class_idx=int(class_idx),
             )

-        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
     else:
-        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder = archive_folder / "Final_Test"
         test_folder.mkdir(parents=True)

         create_image_folder(

@@ -1119,7 +1153,7 @@ def gtsrb(root, config):
             num_examples=num_examples,
         )

-        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

         _make_ann_file(
             path=root / "GT-final_test.csv",
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
     num_samples = {"train": 5, "test": 7}[split]
     num_categories = 3

+    devkit = root / "devkit"
+    devkit.mkdir(parents=True)
+
     if split == "train":
         images_folder_name = "cars_train"
-        devkit = root / "devkit"
-        devkit.mkdir()
-
         annotations_mat_path = devkit / "cars_train_annos.mat"
     else:
         images_folder_name = "cars_test"
test/test_prototype_builtin_datasets.py:

@@ -56,18 +56,14 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_smoke(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not isinstance(dataset, datasets.utils.Dataset):
             raise AssertionError(f"Loading the dataset should return an Dataset, but got {type(dataset)} instead.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_sample(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         try:
             sample = next(iter(dataset))
@@ -84,17 +80,13 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
-        mock_info = dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, mock_info = dataset_mock.load(config)

         assert len(list(dataset)) == mock_info["num_samples"]

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
         if vanilla_tensors:

@@ -105,24 +97,20 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         next(iter(dataset.map(transforms.Identity())))

     @pytest.mark.parametrize("only_datapipe", [False, True])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_traversable(self, dataset_mock, config, only_datapipe):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         traverse(dataset, only_datapipe=only_datapipe)

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         pickle.dumps(dataset)

@@ -135,8 +123,7 @@ class TestCommon:
     @pytest.mark.parametrize("num_workers", [0, 1])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_data_loader(self, dataset_mock, config, num_workers):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         dl = DataLoader(
             dataset,

@@ -153,17 +140,15 @@ class TestCommon:
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
             raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_save_load(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
         sample = next(iter(dataset))

         with io.BytesIO() as buffer:

@@ -173,8 +158,7 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_infinite_buffer_size(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for dp in extract_datapipes(dataset):
             if hasattr(dp, "buffer_size"):

@@ -184,8 +168,7 @@ class TestCommon:

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_has_length(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         assert len(dataset) > 0

@@ -193,9 +176,7 @@ class TestCommon:
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
 class TestQMNIST:
     def test_extra_label(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         sample = next(iter(dataset))
         for key, type in (

@@ -218,9 +199,7 @@ class TestGTSRB:
         if config["split"] != "train":
             return

-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             label_from_path = int(Path(sample["path"]).parent.name)

@@ -230,9 +209,7 @@ class TestGTSRB:
 @parametrize_dataset_mocks(DATASET_MOCKS["usps"])
 class TestUSPS:
     def test_sample_content(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)

         for sample in dataset:
             assert "image" in sample