Unverified commit 3e4d062c authored by Philip Meier, committed by GitHub

Expand tests for prototype datasets (#5187)

* refactor prototype datasets tests

* skip tests with insufficient third party dependencies

* cleanup

* add tests for SBD prototype dataset

* add tests for SEMEION prototype dataset

* add tests for VOC prototype dataset

* add tests for CelebA prototype dataset

* add tests for DTD prototype dataset

* add tests for FER2013 prototype dataset

* add tests for CLEVR prototype dataset

* add tests for oxford-iiit-pet prototype dataset

* enforce tests for new datasets

* add missing archive generation for oxford-iiit-pet tests

* add tests for CUB200 prototype datasets

* fix split generation

* add capability to mark parametrization and xfail cub200 traverse tests
parent bf073e78
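Note on the `marks` capability mentioned in the last bullet: the implementation of `parametrize_dataset_mocks` lives in `builtin_dataset_mocks.py` and is not part of the hunks shown below. A rough, hypothetical sketch of how a per-dataset `marks` mapping could be translated into pytest parametrization (the mock registry, names, and signature here are assumptions, not the actual helper):

```python
import pytest

# Hypothetical stand-ins; neither MOCKS nor this parametrize_dataset_mocks is from the diff.
MOCKS = {"mnist": object(), "cub200": object()}


def parametrize_dataset_mocks(mocks, *, marks=None):
    """Sketch: turn a name -> mock mapping into pytest params, attaching optional per-name marks."""
    marks = marks or {}
    params = [
        pytest.param(mock, id=name, marks=marks.get(name, ()))
        for name, mock in mocks.items()
    ]
    return pytest.mark.parametrize("dataset_mock", params)


@parametrize_dataset_mocks(MOCKS, marks={"cub200": pytest.mark.xfail(reason="traverse() is expected to fail")})
def test_traversable(dataset_mock):
    ...
```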
......@@ -6,17 +6,28 @@ from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
from torch.utils.data.datapipes.iter.grouping import ShardingFilterIterDataPipe as ShardingFilter
from torch.utils.data.graph import traverse
from torchdata.datapipes.iter import IterDataPipe, Shuffler
from torchvision.prototype import transforms
from torchvision.prototype import transforms, datasets
from torchvision.prototype.utils._internal import sequence_to_str
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_coverage():
untested_datasets = set(datasets.list()) - DATASET_MOCKS.keys()
if untested_datasets:
raise AssertionError(
f"The dataset(s) {sequence_to_str(sorted(untested_datasets), separate_last='and ')} "
f"are exposed through `torchvision.prototype.datasets.load()`, but are not tested. "
f"Please add mock data to `test/builtin_dataset_mocks.py`."
)
class TestCommon:
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_smoke(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
if not isinstance(dataset, IterDataPipe):
raise AssertionError(f"Loading the dataset should return an IterDataPipe, but got {type(dataset)} instead.")
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_sample(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -31,6 +42,7 @@ class TestCommon:
if not sample:
raise AssertionError("Sample dictionary is empty.")
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_num_samples(self, dataset_mock, config):
dataset, mock_info = dataset_mock.load(config)
......@@ -40,6 +52,7 @@ class TestCommon:
assert num_samples == mock_info["num_samples"]
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_decoding(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -50,6 +63,7 @@ class TestCommon:
f"{sequence_to_str(sorted(undecoded_features), separate_last='and ')} were not decoded."
)
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_no_vanilla_tensors(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -60,16 +74,33 @@ class TestCommon:
f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors."
)
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_transformable(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
next(iter(dataset.map(transforms.Identity())))
@parametrize_dataset_mocks(
DATASET_MOCKS,
marks={
"cub200": pytest.mark.xfail(
reason="See https://github.com/pytorch/vision/pull/5187#issuecomment-1015479165"
)
},
)
def test_traversable(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
traverse(dataset)
@parametrize_dataset_mocks(
DATASET_MOCKS,
marks={
"cub200": pytest.mark.xfail(
reason="See https://github.com/pytorch/vision/pull/5187#issuecomment-1015479165"
)
},
)
@pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter), ids=lambda type: type.__name__)
def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
def scan(graph):
......@@ -86,8 +117,8 @@ class TestCommon:
raise AssertionError(f"The dataset doesn't comprise a {annotation_dp_type.__name__}() datapipe.")
@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
class TestQMNIST:
@parametrize_dataset_mocks([mock for mock in DATASET_MOCKS if mock.name == "qmnist"])
def test_extra_label(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......
......@@ -26,6 +26,7 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import Feature, Label, BoundingBox
csv.register_dialect("celeba", delimiter=" ", skipinitialspace=True)
......@@ -67,6 +68,7 @@ class CelebA(Dataset):
"celeba",
type=DatasetType.IMAGE,
homepage="https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html",
valid_options=dict(split=("train", "val", "test")),
)
def resources(self, config: DatasetConfig) -> List[OnlineResource]:
......@@ -104,7 +106,7 @@ class CelebA(Dataset):
_SPLIT_ID_TO_NAME = {
"0": "train",
"1": "valid",
"1": "val",
"2": "test",
}
......@@ -117,22 +119,22 @@ class CelebA(Dataset):
def _collate_and_decode_sample(
self,
data: Tuple[Tuple[str, Tuple[str, List[str]], Tuple[str, io.IOBase]], Tuple[str, Dict[str, Any]]],
data: Tuple[Tuple[str, Tuple[Tuple[str, Dict[str, Any]], Tuple[str, io.IOBase]]], Tuple[str, Dict[str, Any]]],
*,
decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> Dict[str, Any]:
split_and_image_data, ann_data = data
_, _, image_data = split_and_image_data
_, (_, image_data) = split_and_image_data
path, buffer = image_data
_, ann = ann_data
image = decoder(buffer) if decoder else buffer
identity = int(ann["identity"]["identity"])
identity = Label(int(ann["identity"]["identity"]))
attributes = {attr: value == "1" for attr, value in ann["attributes"].items()}
bbox = torch.tensor([int(ann["bbox"][key]) for key in ("x_1", "y_1", "width", "height")])
bbox = BoundingBox([int(ann["bbox"][key]) for key in ("x_1", "y_1", "width", "height")])
landmarks = {
landmark: torch.tensor((int(ann["landmarks"][f"{landmark}_x"]), int(ann["landmarks"][f"{landmark}_y"])))
landmark: Feature((int(ann["landmarks"][f"{landmark}_x"]), int(ann["landmarks"][f"{landmark}_y"])))
for landmark in {key[:-2] for key in ann["landmarks"].keys()}
}
......
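The updated type hint and unpacking in CelebA's `_collate_and_decode_sample` encode that the split entry and the image entry now arrive as a nested pair rather than a flat triple. A small illustration with dummy values, independent of the actual CelebA datapipes:

```python
# Dummy stand-ins shaped like the new type hint:
# ((key, ((split_key, split_row), (path, buffer))), (ann_key, ann_row))
split_entry = ("000001.jpg", {"split": "train"})
image_entry = ("img/000001.jpg", b"<jpeg bytes>")
ann_entry = ("000001.jpg", {"identity": {"identity": "42"}})

data = ((split_entry[0], (split_entry, image_entry)), ann_entry)

split_and_image_data, ann_data = data
_, (_, image_data) = split_and_image_data  # new nested unpacking
path, buffer = image_data
assert (path, ann_data[1]["identity"]["identity"]) == ("img/000001.jpg", "42")
```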
......@@ -105,7 +105,7 @@ class CUB200(Dataset):
path = pathlib.Path(data[0])
return path.with_suffix(".jpg").name
def _2011_decode_ann(
def _2011_load_ann(
self,
data: Tuple[str, Tuple[List[str], Tuple[str, io.IOBase]]],
*,
......@@ -126,7 +126,7 @@ class CUB200(Dataset):
path = pathlib.Path(data[0])
return path.with_suffix(".jpg").name, data
def _2010_decode_ann(
def _2010_load_ann(
self, data: Tuple[str, Tuple[str, io.IOBase]], *, decoder: Optional[Callable[[io.IOBase], torch.Tensor]]
) -> Dict[str, Any]:
_, (path, buffer) = data
......@@ -154,7 +154,7 @@ class CUB200(Dataset):
label_str, category = dir_name.split(".")
return dict(
(self._2011_decode_ann if year == "2011" else self._2010_decode_ann)(anns_data, decoder=decoder),
(self._2011_load_ann if year == "2011" else self._2010_load_ann)(anns_data, decoder=decoder),
image=decoder(buffer) if decoder else buffer,
label=Label(int(label_str), category=category),
)
......@@ -196,7 +196,7 @@ class CUB200(Dataset):
else: # config.year == "2010"
split_dp, images_dp, anns_dp = resource_dps
split_dp = Filter(split_dp, path_comparator("stem", config.split))
split_dp = Filter(split_dp, path_comparator("name", f"{config.split}.txt"))
split_dp = LineReader(split_dp, decode=True, return_path=False)
split_dp = Mapper(split_dp, self._2010_split_key)
......
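For the CUB200 split filter change, `path_comparator` is defined elsewhere in the prototype utils and is not shown in this diff; presumably it builds a predicate over a `pathlib.Path` attribute. A rough sketch of the behavior the two filter variants rely on, with made-up file names:

```python
import pathlib


def path_comparator(attr, value):
    # Presumed behavior of the helper: compare one Path attribute against a fixed value.
    def predicate(data):
        # Datapipes yield (path, ...) tuples; plain strings are also accepted here for brevity.
        path = pathlib.Path(data[0] if isinstance(data, tuple) else data)
        return getattr(path, attr) == value

    return predicate


files = ["lists/train.txt", "lists/test.txt"]
# In this toy example, matching the stem ("train") and the full name ("train.txt")
# select the same file; the diff switches to the explicit file name.
assert [f for f in files if path_comparator("stem", "train")(f)] == ["lists/train.txt"]
assert [f for f in files if path_comparator("name", "train.txt")(f)] == ["lists/train.txt"]
```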
import enum
import functools
import io
import pathlib
from typing import Any, Callable, Dict, List, Optional, Tuple
......@@ -126,7 +127,7 @@ class DTD(Dataset):
ref_key_fn=self._image_key_fn,
buffer_size=INFINITE_BUFFER_SIZE,
)
return Mapper(dp, self._collate_and_decode_sample, fn_kwargs=dict(decoder=decoder))
return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _filter_images(self, data: Tuple[str, Any]) -> bool:
return self._classify_archive(data) == DTDDemux.IMAGES
......
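The DTD `Mapper` change swaps keyword forwarding via `fn_kwargs` for an explicit `functools.partial`; the mapped function receives the same arguments either way. A toy illustration of that equivalence, independent of the torchdata API:

```python
import functools


def collate(sample, *, decoder=None):
    return {"raw": sample, "decoded": decoder(sample) if decoder else None}


decoder = str.upper

# fn_kwargs style: extra keyword arguments are forwarded on every call ...
assert collate("abc", **dict(decoder=decoder)) == {"raw": "abc", "decoded": "ABC"}
# ... functools.partial style: the keyword is bound once, up front.
assert functools.partial(collate, decoder=decoder)("abc") == {"raw": "abc", "decoded": "ABC"}
```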
......@@ -31,6 +31,7 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import Feature
class SBD(Dataset):
......@@ -83,11 +84,11 @@ class SBD(Dataset):
# the boundaries are stored in sparse CSC format, which is not supported by PyTorch
boundaries = (
torch.as_tensor(np.stack([raw_boundary[0].toarray() for raw_boundary in raw_boundaries]))
Feature(np.stack([raw_boundary[0].toarray() for raw_boundary in raw_boundaries]))
if decode_boundaries
else None
)
segmentation = torch.as_tensor(raw_segmentation) if decode_segmentation else None
segmentation = Feature(raw_segmentation) if decode_segmentation else None
return boundaries, segmentation
......@@ -140,6 +141,7 @@ class SBD(Dataset):
if config.split == "train_noval":
split_dp = extra_split_dp
split_dp = Filter(split_dp, path_comparator("stem", config.split))
split_dp = LineReader(split_dp, decode=True)
split_dp = hint_sharding(split_dp)
split_dp = hint_shuffling(split_dp)
......
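The SBD change from `torch.as_tensor(...)` to `Feature(...)` suggests the prototype `Feature` type accepts array-likes such as numpy arrays directly. A toy Tensor-subclass sketch of that idea, not the actual `torchvision.prototype.features` implementation:

```python
import numpy as np
import torch


class Feature(torch.Tensor):
    # Toy sketch only: the real Feature class carries more metadata and machinery.
    def __new__(cls, data):
        return torch.as_tensor(data).as_subclass(cls)


boundaries = Feature(np.stack([np.zeros((2, 2)), np.ones((2, 2))]))
assert isinstance(boundaries, torch.Tensor) and boundaries.shape == (2, 2, 2)
```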
......@@ -18,6 +18,7 @@ from torchvision.prototype.datasets.utils import (
DatasetType,
)
from torchvision.prototype.datasets.utils._internal import image_buffer_from_array, hint_sharding, hint_shuffling
from torchvision.prototype.features import Image, Label
class SEMEION(Dataset):
......@@ -46,14 +47,13 @@ class SEMEION(Dataset):
label_data = [int(label) for label in data[256:] if label]
if decoder is raw:
image = image_data.unsqueeze(0)
image = Image(image_data.unsqueeze(0))
else:
image_buffer = image_buffer_from_array(image_data.numpy())
image = decoder(image_buffer) if decoder else image_buffer # type: ignore[assignment]
label = next((idx for idx, one_hot_label in enumerate(label_data) if one_hot_label))
category = self.info.categories[label]
return dict(image=image, label=label, category=category)
label_idx = next((idx for idx, one_hot_label in enumerate(label_data) if one_hot_label))
return dict(image=image, label=Label(label_idx, category=self.info.categories[label_idx]))
def _make_datapipe(
self,
......
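The SEMEION label handling above collapses a one-hot row into a single class index before wrapping it in `Label`. A standalone illustration of that conversion with made-up data:

```python
# One-hot row for class 3 out of 10, as parsed from a SEMEION-style text line.
label_data = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
categories = [str(i) for i in range(10)]  # made-up category names

label_idx = next(idx for idx, one_hot in enumerate(label_data) if one_hot)
assert (label_idx, categories[label_idx]) == (3, "3")
```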
......@@ -30,34 +30,50 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import BoundingBox
HERE = pathlib.Path(__file__).parent
class VOCDatasetInfo(DatasetInfo):
def __init__(self, *args: Any, **kwargs: Any):
super().__init__(*args, **kwargs)
self._configs = tuple(config for config in self._configs if config.split != "test" or config.year == "2007")
def make_config(self, **options: Any) -> DatasetConfig:
config = super().make_config(**options)
if config.split == "test" and config.year != "2007":
raise ValueError("`split='test'` is only available for `year='2007'`")
return config
class VOC(Dataset):
def _make_info(self) -> DatasetInfo:
return DatasetInfo(
return VOCDatasetInfo(
"voc",
type=DatasetType.IMAGE,
homepage="http://host.robots.ox.ac.uk/pascal/VOC/",
valid_options=dict(
split=("train", "val", "test"),
year=("2012",),
split=("train", "val", "trainval", "test"),
year=("2012", "2007", "2008", "2009", "2010", "2011"),
task=("detection", "segmentation"),
),
)
_TRAIN_VAL_ARCHIVES = {
"2007": ("VOCtrainval_06-Nov-2007.tar", "7d8cd951101b0957ddfd7a530bdc8a94f06121cfc1e511bb5937e973020c7508"),
"2008": ("VOCtrainval_14-Jul-2008.tar", "7f0ca53c1b5a838fbe946965fc106c6e86832183240af5c88e3f6c306318d42e"),
"2009": ("VOCtrainval_11-May-2009.tar", "11cbe1741fb5bdadbbca3c08e9ec62cd95c14884845527d50847bc2cf57e7fd6"),
"2010": ("VOCtrainval_03-May-2010.tar", "1af4189cbe44323ab212bff7afbc7d0f55a267cc191eb3aac911037887e5c7d4"),
"2011": ("VOCtrainval_25-May-2011.tar", "0a7f5f5d154f7290ec65ec3f78b72ef72c6d93ff6d79acd40dc222a9ee5248ba"),
"2012": ("VOCtrainval_11-May-2012.tar", "e14f763270cf193d0b5f74b169f44157a4b0c6efa708f4dd0ff78ee691763bcb"),
}
_TEST_ARCHIVES = {
"2007": ("VOCtest_06-Nov-2007.tar", "6836888e2e01dca84577a849d339fa4f73e1e4f135d312430c4856b5609b4892")
}
def resources(self, config: DatasetConfig) -> List[OnlineResource]:
if config.year == "2012":
if config.split == "train":
archive = HttpResource(
"http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
sha256="e14f763270cf193d0b5f74b169f44157a4b0c6efa708f4dd0ff78ee691763bcb",
)
else:
raise RuntimeError("FIXME")
else:
raise RuntimeError("FIXME")
file_name, sha256 = (self._TEST_ARCHIVES if config.split == "test" else self._TRAIN_VAL_ARCHIVES)[config.year]
archive = HttpResource(f"http://host.robots.ox.ac.uk/pascal/VOC/voc{config.year}/{file_name}", sha256=sha256)
return [archive]
_ANNS_FOLDER = dict(
......@@ -88,7 +104,7 @@ class VOC(Dataset):
objects = result["annotation"]["object"]
bboxes = [obj["bndbox"] for obj in objects]
bboxes = [[int(bbox[part]) for part in ("xmin", "ymin", "xmax", "ymax")] for bbox in bboxes]
return torch.tensor(bboxes)
return BoundingBox(bboxes)
def _collate_and_decode_sample(
self,
......
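A quick usage sketch of the constraint the new `VOCDatasetInfo.make_config` enforces and of the table-driven archive lookup, using stripped-down stand-ins rather than the real `Dataset`/`DatasetInfo` machinery:

```python
_TRAIN_VAL_ARCHIVES = {"2007": "VOCtrainval_06-Nov-2007.tar", "2012": "VOCtrainval_11-May-2012.tar"}
_TEST_ARCHIVES = {"2007": "VOCtest_06-Nov-2007.tar"}


def make_config(*, split="train", year="2012"):
    # Mirrors the new validation: the test split is only published for VOC2007.
    if split == "test" and year != "2007":
        raise ValueError("`split='test'` is only available for `year='2007'`")
    return dict(split=split, year=year)


def archive_name(config):
    # Mirrors the lookup that replaced the year-2012-only branch in resources().
    table = _TEST_ARCHIVES if config["split"] == "test" else _TRAIN_VAL_ARCHIVES
    return table[config["year"]]


assert archive_name(make_config(split="test", year="2007")) == "VOCtest_06-Nov-2007.tar"
try:
    make_config(split="test", year="2012")
except ValueError:
    pass  # rejected, as intended
```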