Unverified commit 3e4d062c authored by Philip Meier, committed by GitHub

Expand tests for prototype datasets (#5187)

* refactor prototype datasets tests

* skip tests with insufficient third party dependencies

* cleanup

* add tests for SBD prototype dataset

* add tests for SEMEION prototype dataset

* add tests for VOC prototype dataset

* add tests for CelebA prototype dataset

* add tests for DTD prototype dataset

* add tests for FER2013 prototype dataset

* add tests for CLEVR prototype dataset

* add tests for oxford-iiit-pet prototype dataset

* enforce tests for new datasets

* add missing archive generation for oxford-iiit-pet tests

* add tests for CUB200 prototype datasets

* fix split generation

* add capability to mark parametrization and xfail cub200 traverse tests
parent bf073e78
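Note on the `marks` capability mentioned in the last bullet: the implementation of `parametrize_dataset_mocks` lives in `builtin_dataset_mocks.py` and is not part of the hunks shown below. A rough, hypothetical sketch of how a per-dataset `marks` mapping could be translated into pytest parametrization (the mock registry, names, and signature here are assumptions, not the actual helper):

```python
import pytest

# Hypothetical stand-ins; neither MOCKS nor this parametrize_dataset_mocks is from the diff.
MOCKS = {"mnist": object(), "cub200": object()}


def parametrize_dataset_mocks(mocks, *, marks=None):
    """Sketch: turn a name -> mock mapping into pytest params, attaching optional per-name marks."""
    marks = marks or {}
    params = [
        pytest.param(mock, id=name, marks=marks.get(name, ()))
        for name, mock in mocks.items()
    ]
    return pytest.mark.parametrize("dataset_mock", params)


@parametrize_dataset_mocks(MOCKS, marks={"cub200": pytest.mark.xfail(reason="traverse() is expected to fail")})
def test_traversable(dataset_mock):
    ...
```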
......@@ -6,17 +6,28 @@ from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
from torch.utils.data.datapipes.iter.grouping import ShardingFilterIterDataPipe as ShardingFilter
from torch.utils.data.graph import traverse
from torchdata.datapipes.iter import IterDataPipe, Shuffler
from torchvision.prototype import transforms
from torchvision.prototype import transforms, datasets
from torchvision.prototype.utils._internal import sequence_to_str
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_coverage():
untested_datasets = set(datasets.list()) - DATASET_MOCKS.keys()
if untested_datasets:
raise AssertionError(
f"The dataset(s) {sequence_to_str(sorted(untested_datasets), separate_last='and ')} "
f"are exposed through `torchvision.prototype.datasets.load()`, but are not tested. "
f"Please add mock data to `test/builtin_dataset_mocks.py`."
)
class TestCommon:
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_smoke(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
if not isinstance(dataset, IterDataPipe):
raise AssertionError(f"Loading the dataset should return an IterDataPipe, but got {type(dataset)} instead.")
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_sample(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -31,6 +42,7 @@ class TestCommon:
if not sample:
raise AssertionError("Sample dictionary is empty.")
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_num_samples(self, dataset_mock, config):
dataset, mock_info = dataset_mock.load(config)
......@@ -40,6 +52,7 @@ class TestCommon:
assert num_samples == mock_info["num_samples"]
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_decoding(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -50,6 +63,7 @@ class TestCommon:
f"{sequence_to_str(sorted(undecoded_features), separate_last='and ')} were not decoded."
)
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_no_vanilla_tensors(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......@@ -60,16 +74,33 @@ class TestCommon:
f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors."
)
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_transformable(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
next(iter(dataset.map(transforms.Identity())))
@parametrize_dataset_mocks(
DATASET_MOCKS,
marks={
"cub200": pytest.mark.xfail(
reason="See https://github.com/pytorch/vision/pull/5187#issuecomment-1015479165"
)
},
)
def test_traversable(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
traverse(dataset)
@parametrize_dataset_mocks(
DATASET_MOCKS,
marks={
"cub200": pytest.mark.xfail(
reason="See https://github.com/pytorch/vision/pull/5187#issuecomment-1015479165"
)
},
)
@pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter), ids=lambda type: type.__name__)
def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
def scan(graph):
......@@ -86,8 +117,8 @@ class TestCommon:
raise AssertionError(f"The dataset doesn't comprise a {annotation_dp_type.__name__}() datapipe.")
@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
class TestQMNIST:
@parametrize_dataset_mocks([mock for mock in DATASET_MOCKS if mock.name == "qmnist"])
def test_extra_label(self, dataset_mock, config):
dataset, _ = dataset_mock.load(config)
......
......@@ -26,6 +26,7 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import Feature, Label, BoundingBox
csv.register_dialect("celeba", delimiter=" ", skipinitialspace=True)
......@@ -67,6 +68,7 @@ class CelebA(Dataset):
"celeba",
type=DatasetType.IMAGE,
homepage="https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html",
valid_options=dict(split=("train", "val", "test")),
)
def resources(self, config: DatasetConfig) -> List[OnlineResource]:
......@@ -104,7 +106,7 @@ class CelebA(Dataset):
_SPLIT_ID_TO_NAME = {
"0": "train",
"1": "valid",
"1": "val",
"2": "test",
}
......@@ -117,22 +119,22 @@ class CelebA(Dataset):
def _collate_and_decode_sample(
self,
data: Tuple[Tuple[str, Tuple[str, List[str]], Tuple[str, io.IOBase]], Tuple[str, Dict[str, Any]]],
data: Tuple[Tuple[str, Tuple[Tuple[str, Dict[str, Any]], Tuple[str, io.IOBase]]], Tuple[str, Dict[str, Any]]],
*,
decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> Dict[str, Any]:
split_and_image_data, ann_data = data
_, _, image_data = split_and_image_data
_, (_, image_data) = split_and_image_data
path, buffer = image_data
_, ann = ann_data
image = decoder(buffer) if decoder else buffer
identity = int(ann["identity"]["identity"])
identity = Label(int(ann["identity"]["identity"]))
attributes = {attr: value == "1" for attr, value in ann["attributes"].items()}
bbox = torch.tensor([int(ann["bbox"][key]) for key in ("x_1", "y_1", "width", "height")])
bbox = BoundingBox([int(ann["bbox"][key]) for key in ("x_1", "y_1", "width", "height")])
landmarks = {
landmark: torch.tensor((int(ann["landmarks"][f"{landmark}_x"]), int(ann["landmarks"][f"{landmark}_y"])))
landmark: Feature((int(ann["landmarks"][f"{landmark}_x"]), int(ann["landmarks"][f"{landmark}_y"])))
for landmark in {key[:-2] for key in ann["landmarks"].keys()}
}
......
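The updated type hint and unpacking in CelebA's `_collate_and_decode_sample` encode that the split entry and the image entry now arrive as a nested pair rather than a flat triple. A small illustration with dummy values, independent of the actual CelebA datapipes:

```python
# Dummy stand-ins shaped like the new type hint:
# ((key, ((split_key, split_row), (path, buffer))), (ann_key, ann_row))
split_entry = ("000001.jpg", {"split": "train"})
image_entry = ("img/000001.jpg", b"<jpeg bytes>")
ann_entry = ("000001.jpg", {"identity": {"identity": "42"}})

data = ((split_entry[0], (split_entry, image_entry)), ann_entry)

split_and_image_data, ann_data = data
_, (_, image_data) = split_and_image_data  # new nested unpacking
path, buffer = image_data
assert (path, ann_data[1]["identity"]["identity"]) == ("img/000001.jpg", "42")
```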
......@@ -105,7 +105,7 @@ class CUB200(Dataset):
path = pathlib.Path(data[0])
return path.with_suffix(".jpg").name
def _2011_decode_ann(
def _2011_load_ann(
self,
data: Tuple[str, Tuple[List[str], Tuple[str, io.IOBase]]],
*,
......@@ -126,7 +126,7 @@ class CUB200(Dataset):
path = pathlib.Path(data[0])
return path.with_suffix(".jpg").name, data
def _2010_decode_ann(
def _2010_load_ann(
self, data: Tuple[str, Tuple[str, io.IOBase]], *, decoder: Optional[Callable[[io.IOBase], torch.Tensor]]
) -> Dict[str, Any]:
_, (path, buffer) = data
......@@ -154,7 +154,7 @@ class CUB200(Dataset):
label_str, category = dir_name.split(".")
return dict(
(self._2011_decode_ann if year == "2011" else self._2010_decode_ann)(anns_data, decoder=decoder),
(self._2011_load_ann if year == "2011" else self._2010_load_ann)(anns_data, decoder=decoder),
image=decoder(buffer) if decoder else buffer,
label=Label(int(label_str), category=category),
)
......@@ -196,7 +196,7 @@ class CUB200(Dataset):
else: # config.year == "2010"
split_dp, images_dp, anns_dp = resource_dps
split_dp = Filter(split_dp, path_comparator("stem", config.split))
split_dp = Filter(split_dp, path_comparator("name", f"{config.split}.txt"))
split_dp = LineReader(split_dp, decode=True, return_path=False)
split_dp = Mapper(split_dp, self._2010_split_key)
......
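For the CUB200 split filter change, `path_comparator` is defined elsewhere in the prototype utils and is not shown in this diff; presumably it builds a predicate over a `pathlib.Path` attribute. A rough sketch of the behavior the two filter variants rely on, with made-up file names:

```python
import pathlib


def path_comparator(attr, value):
    # Presumed behavior of the helper: compare one Path attribute against a fixed value.
    def predicate(data):
        # Datapipes yield (path, ...) tuples; plain strings are also accepted here for brevity.
        path = pathlib.Path(data[0] if isinstance(data, tuple) else data)
        return getattr(path, attr) == value

    return predicate


files = ["lists/train.txt", "lists/test.txt"]
# In this toy example, matching the stem ("train") and the full name ("train.txt")
# select the same file; the diff switches to the explicit file name.
assert [f for f in files if path_comparator("stem", "train")(f)] == ["lists/train.txt"]
assert [f for f in files if path_comparator("name", "train.txt")(f)] == ["lists/train.txt"]
```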
import enum
import functools
import io
import pathlib
from typing import Any, Callable, Dict, List, Optional, Tuple
......@@ -126,7 +127,7 @@ class DTD(Dataset):
ref_key_fn=self._image_key_fn,
buffer_size=INFINITE_BUFFER_SIZE,
)
return Mapper(dp, self._collate_and_decode_sample, fn_kwargs=dict(decoder=decoder))
return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _filter_images(self, data: Tuple[str, Any]) -> bool:
return self._classify_archive(data) == DTDDemux.IMAGES
......
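The DTD `Mapper` change swaps keyword forwarding via `fn_kwargs` for an explicit `functools.partial`; the mapped function receives the same arguments either way. A toy illustration of that equivalence, independent of the torchdata API:

```python
import functools


def collate(sample, *, decoder=None):
    return {"raw": sample, "decoded": decoder(sample) if decoder else None}


decoder = str.upper

# fn_kwargs style: extra keyword arguments are forwarded on every call ...
assert collate("abc", **dict(decoder=decoder)) == {"raw": "abc", "decoded": "ABC"}
# ... functools.partial style: the keyword is bound once, up front.
assert functools.partial(collate, decoder=decoder)("abc") == {"raw": "abc", "decoded": "ABC"}
```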
......@@ -31,6 +31,7 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import Feature
class SBD(Dataset):
......@@ -83,11 +84,11 @@ class SBD(Dataset):
# the boundaries are stored in sparse CSC format, which is not supported by PyTorch
boundaries = (
torch.as_tensor(np.stack([raw_boundary[0].toarray() for raw_boundary in raw_boundaries]))
Feature(np.stack([raw_boundary[0].toarray() for raw_boundary in raw_boundaries]))
if decode_boundaries
else None
)
segmentation = torch.as_tensor(raw_segmentation) if decode_segmentation else None
segmentation = Feature(raw_segmentation) if decode_segmentation else None
return boundaries, segmentation
......@@ -140,6 +141,7 @@ class SBD(Dataset):
if config.split == "train_noval":
split_dp = extra_split_dp
split_dp = Filter(split_dp, path_comparator("stem", config.split))
split_dp = LineReader(split_dp, decode=True)
split_dp = hint_sharding(split_dp)
split_dp = hint_shuffling(split_dp)
......
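The SBD change from `torch.as_tensor(...)` to `Feature(...)` suggests the prototype `Feature` type accepts array-likes such as numpy arrays directly. A toy Tensor-subclass sketch of that idea, not the actual `torchvision.prototype.features` implementation:

```python
import numpy as np
import torch


class Feature(torch.Tensor):
    # Toy sketch only: the real Feature class carries more metadata and machinery.
    def __new__(cls, data):
        return torch.as_tensor(data).as_subclass(cls)


boundaries = Feature(np.stack([np.zeros((2, 2)), np.ones((2, 2))]))
assert isinstance(boundaries, torch.Tensor) and boundaries.shape == (2, 2, 2)
```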
......@@ -18,6 +18,7 @@ from torchvision.prototype.datasets.utils import (
DatasetType,
)
from torchvision.prototype.datasets.utils._internal import image_buffer_from_array, hint_sharding, hint_shuffling
from torchvision.prototype.features import Image, Label
class SEMEION(Dataset):
......@@ -46,14 +47,13 @@ class SEMEION(Dataset):
label_data = [int(label) for label in data[256:] if label]
if decoder is raw:
image = image_data.unsqueeze(0)
image = Image(image_data.unsqueeze(0))
else:
image_buffer = image_buffer_from_array(image_data.numpy())
image = decoder(image_buffer) if decoder else image_buffer # type: ignore[assignment]
label = next((idx for idx, one_hot_label in enumerate(label_data) if one_hot_label))
category = self.info.categories[label]
return dict(image=image, label=label, category=category)
label_idx = next((idx for idx, one_hot_label in enumerate(label_data) if one_hot_label))
return dict(image=image, label=Label(label_idx, category=self.info.categories[label_idx]))
def _make_datapipe(
self,
......
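The SEMEION label handling above collapses a one-hot row into a single class index before wrapping it in `Label`. A standalone illustration of that conversion with made-up data:

```python
# One-hot row for class 3 out of 10, as parsed from a SEMEION-style text line.
label_data = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
categories = [str(i) for i in range(10)]  # made-up category names

label_idx = next(idx for idx, one_hot in enumerate(label_data) if one_hot)
assert (label_idx, categories[label_idx]) == (3, "3")
```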
......@@ -30,34 +30,50 @@ from torchvision.prototype.datasets.utils._internal import (
hint_sharding,
hint_shuffling,
)
from torchvision.prototype.features import BoundingBox
HERE = pathlib.Path(__file__).parent
class VOCDatasetInfo(DatasetInfo):
def __init__(self, *args: Any, **kwargs: Any):
super().__init__(*args, **kwargs)
self._configs = tuple(config for config in self._configs if config.split != "test" or config.year == "2007")
def make_config(self, **options: Any) -> DatasetConfig:
config = super().make_config(**options)
if config.split == "test" and config.year != "2007":
raise ValueError("`split='test'` is only available for `year='2007'`")
return config
class VOC(Dataset):
def _make_info(self) -> DatasetInfo:
return DatasetInfo(
return VOCDatasetInfo(
"voc",
type=DatasetType.IMAGE,
homepage="http://host.robots.ox.ac.uk/pascal/VOC/",
valid_options=dict(
split=("train", "val", "test"),
year=("2012",),
split=("train", "val", "trainval", "test"),
year=("2012", "2007", "2008", "2009", "2010", "2011"),
task=("detection", "segmentation"),
),
)
_TRAIN_VAL_ARCHIVES = {
"2007": ("VOCtrainval_06-Nov-2007.tar", "7d8cd951101b0957ddfd7a530bdc8a94f06121cfc1e511bb5937e973020c7508"),
"2008": ("VOCtrainval_14-Jul-2008.tar", "7f0ca53c1b5a838fbe946965fc106c6e86832183240af5c88e3f6c306318d42e"),
"2009": ("VOCtrainval_11-May-2009.tar", "11cbe1741fb5bdadbbca3c08e9ec62cd95c14884845527d50847bc2cf57e7fd6"),
"2010": ("VOCtrainval_03-May-2010.tar", "1af4189cbe44323ab212bff7afbc7d0f55a267cc191eb3aac911037887e5c7d4"),
"2011": ("VOCtrainval_25-May-2011.tar", "0a7f5f5d154f7290ec65ec3f78b72ef72c6d93ff6d79acd40dc222a9ee5248ba"),
"2012": ("VOCtrainval_11-May-2012.tar", "e14f763270cf193d0b5f74b169f44157a4b0c6efa708f4dd0ff78ee691763bcb"),
}
_TEST_ARCHIVES = {
"2007": ("VOCtest_06-Nov-2007.tar", "6836888e2e01dca84577a849d339fa4f73e1e4f135d312430c4856b5609b4892")
}
def resources(self, config: DatasetConfig) -> List[OnlineResource]:
if config.year == "2012":
if config.split == "train":
archive = HttpResource(
"http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar",
sha256="e14f763270cf193d0b5f74b169f44157a4b0c6efa708f4dd0ff78ee691763bcb",
)
else:
raise RuntimeError("FIXME")
else:
raise RuntimeError("FIXME")
file_name, sha256 = (self._TEST_ARCHIVES if config.split == "test" else self._TRAIN_VAL_ARCHIVES)[config.year]
archive = HttpResource(f"http://host.robots.ox.ac.uk/pascal/VOC/voc{config.year}/{file_name}", sha256=sha256)
return [archive]
_ANNS_FOLDER = dict(
......@@ -88,7 +104,7 @@ class VOC(Dataset):
objects = result["annotation"]["object"]
bboxes = [obj["bndbox"] for obj in objects]
bboxes = [[int(bbox[part]) for part in ("xmin", "ymin", "xmax", "ymax")] for bbox in bboxes]
return torch.tensor(bboxes)
return BoundingBox(bboxes)
def _collate_and_decode_sample(
self,
......
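A quick usage sketch of the constraint the new `VOCDatasetInfo.make_config` enforces and of the table-driven archive lookup, using stripped-down stand-ins rather than the real `Dataset`/`DatasetInfo` machinery:

```python
_TRAIN_VAL_ARCHIVES = {"2007": "VOCtrainval_06-Nov-2007.tar", "2012": "VOCtrainval_11-May-2012.tar"}
_TEST_ARCHIVES = {"2007": "VOCtest_06-Nov-2007.tar"}


def make_config(*, split="train", year="2012"):
    # Mirrors the new validation: the test split is only published for VOC2007.
    if split == "test" and year != "2007":
        raise ValueError("`split='test'` is only available for `year='2007'`")
    return dict(split=split, year=year)


def archive_name(config):
    # Mirrors the lookup that replaced the year-2012-only branch in resources().
    table = _TEST_ARCHIVES if config["split"] == "test" else _TRAIN_VAL_ARCHIVES
    return table[config["year"]]


assert archive_name(make_config(split="test", year="2007")) == "VOCtest_06-Nov-2007.tar"
try:
    make_config(split="test", year="2012")
except ValueError:
    pass  # rejected, as intended
```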