builtin_dataset_mocks.py 20.5 KB
Newer Older
1
import contextlib
2
3
import functools
import gzip
4
import itertools
Philip Meier's avatar
Philip Meier committed
5
import json
6
7
8
9
import lzma
import pathlib
import pickle
import tempfile
10
from collections import defaultdict, UserList
11
12

import numpy as np
Philip Meier's avatar
Philip Meier committed
13
import PIL.Image
14
15
import pytest
import torch
16
from datasets_utils import make_zip, make_tar, create_image_folder
17
18
from torch.testing import make_tensor as _make_tensor
from torchvision.prototype import datasets
19
from torchvision.prototype.datasets._api import DEFAULT_DECODER_MAP, DEFAULT_DECODER, find
Philip Meier's avatar
Philip Meier committed
20

21
# `torch.testing.make_tensor` with the `device` keyword pre-bound to "cpu".
make_tensor = functools.partial(_make_tensor, device="cpu")
Philip Meier's avatar
Philip Meier committed
22
# `make_tensor` with the empty shape `()` pre-bound as its first positional argument.
make_scalar = functools.partial(make_tensor, ())
23

24
25
# Temporary directory created when this module is imported; every `DatasetMock` uses a sub-directory of it as root.
TEST_HOME = pathlib.Path(tempfile.mkdtemp())

26

27
# Names made available by a star-import of this module.
__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"]
28
29


30
class ResourceMock(datasets.utils.OnlineResource):
    """Online resource stand-in whose download step always fails.

    In addition to the keyword arguments forwarded to ``OnlineResource``, the dataset name and config are stored
    so that the failure message can name them.
    """

    def __init__(self, *, dataset_name, dataset_config, **kwargs):
        super().__init__(**kwargs)
        self.dataset_name = dataset_name
        self.dataset_config = dataset_config

    def _download(self, _):
        # NOTE(review): presumably only invoked by the base class when the file is not present on disk, i.e. when
        # the mock data function did not create it -- confirm against `OnlineResource.load`.
        message = (
            f"Dataset '{self.dataset_name}' requires the file '{self.file_name}' for {self.dataset_config}, "
            f"but this file does not exist."
        )
        raise pytest.UsageError(message)


43
44
45
46
47
48
class DatasetMock:
    """Pairs a registered dataset with a function that fabricates its files on disk.

    Args:
        name: Name handed to ``find`` to look up the dataset.
        mock_data_fn: Callable ``(info, root, config)`` that creates the mock files below ``root``. It has to
            return the number of samples as an integer, a dictionary with at least a ``'num_samples'`` entry, or a
            dictionary that maps ``DatasetConfig`` objects to one of the former.
        configs: Configs this mock should be used with. Falls back to ``info._configs`` if omitted or empty.
    """

    def __init__(self, name, mock_data_fn, *, configs=None):
        self.dataset = find(name)
        self.root = TEST_HOME / self.dataset.name
        self.mock_data_fn = self._parse_mock_data(mock_data_fn)
        self.configs = configs or self.info._configs
        # Maps a config to the tuple `(mock_resources, mock_info)` created for it.
        self._cache = {}

    @property
    def info(self):
        """The ``info`` object of the wrapped dataset."""
        return self.dataset.info

    @property
    def name(self):
        """The name stored on the dataset's ``info`` object."""
        return self.info.name

    def _parse_mock_data(self, mock_data_fn):
        """Wrap ``mock_data_fn`` so that its return value is validated and normalized.

        The returned wrapper has the same ``(info, root, config)`` signature and always returns a dictionary that
        maps configs to mock info dictionaries containing a ``'num_samples'`` entry.

        Raises (from the wrapper):
            pytest.UsageError: If ``mock_data_fn`` returns ``None``, mixes ``DatasetConfig`` keys with other key
                types, returns mock info for a config that is already cached, or returns mock info that is neither
                an integer nor a dictionary with a ``'num_samples'`` entry.
        """

        def wrapper(info, root, config):
            mock_infos = mock_data_fn(info, root, config)

            if mock_infos is None:
                raise pytest.UsageError(
                    f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an "
                    f"integer indicating the number of samples for the current `config`."
                )

            # Only a dictionary keyed by `DatasetConfig` is treated as "mock info per config". Everything else is
            # interpreted as the mock info of the requested `config`.
            key_types = {type(key) for key in mock_infos} if isinstance(mock_infos, dict) else set()
            if datasets.utils.DatasetConfig not in key_types:
                mock_infos = {config: mock_infos}
            elif len(key_types) > 1:
                raise pytest.UsageError(
                    f"Unable to handle the returned dictionary of the mock data function for dataset {self.name}. If "
                    f"returned dictionary uses `DatasetConfig` as key type, all keys should be of that type."
                )

            # Iterate over a snapshot since integer entries are replaced inside the loop.
            for config_, mock_info in list(mock_infos.items()):
                if config_ in self._cache:
                    raise pytest.UsageError(
                        f"The mock info for config {config_} of dataset {self.name} generated for config {config} "
                        f"already exists in the cache."
                    )
                if isinstance(mock_info, int):
                    mock_infos[config_] = dict(num_samples=mock_info)
                elif not isinstance(mock_info, dict):
                    # Report the type of the offending entry rather than that of the enclosing dictionary.
                    raise pytest.UsageError(
                        f"The mock data function for dataset '{self.name}' returned a {type(mock_info)} for `config` "
                        f"{config_}. The returned object should be a dictionary containing at least the number of "
                        f"samples for the key `'num_samples'`. If no additional information is required for specific "
                        f"tests, the number of samples can also be returned as an integer."
                    )
                elif "num_samples" not in mock_info:
                    raise pytest.UsageError(
                        f"The dictionary returned by the mock data function for dataset '{self.name}' and config "
                        f"{config_} has to contain a `'num_samples'` entry indicating the number of samples."
                    )

            return mock_infos

        return wrapper

    def _load_mock(self, config):
        """Return the cached ``(mock_resources, mock_info)`` for ``config``, generating the mock data on a miss.

        A single call of the mock data function may fill the cache for several configs at once.
        """
        with contextlib.suppress(KeyError):
            return self._cache[config]

        self.root.mkdir(exist_ok=True)
        for config_, mock_info in self.mock_data_fn(self.info, self.root, config).items():
            mock_resources = [
                ResourceMock(dataset_name=self.name, dataset_config=config_, file_name=resource.file_name)
                for resource in self.dataset.resources(config_)
            ]
            self._cache[config_] = (mock_resources, mock_info)

        return self._cache[config]

    def load(self, config, *, decoder=DEFAULT_DECODER):
        """Build the datapipe for ``config`` on top of the mock data.

        The test is skipped if ``info.check_dependencies()`` raises a ``ModuleNotFoundError``.

        Args:
            config: Config to load.
            decoder: Decoder handed to the datapipe. If left at ``DEFAULT_DECODER``, the entry of
                ``DEFAULT_DECODER_MAP`` for ``info.type`` is used instead.

        Returns:
            Tuple ``(datapipe, mock_info)``.
        """
        try:
            self.info.check_dependencies()
        except ModuleNotFoundError as error:
            pytest.skip(str(error))

        mock_resources, mock_info = self._load_mock(config)
        datapipe = self.dataset._make_datapipe(
            [resource.load(self.root) for resource in mock_resources],
            config=config,
            decoder=DEFAULT_DECODER_MAP.get(self.info.type) if decoder is DEFAULT_DECODER else decoder,
        )
        return datapipe, mock_info


132
133
134
135
136
137
class DatasetMocks(UserList):
    """List of ``DatasetMock`` objects that can be filled through a decorator."""

    def append_named_callable(self, fn):
        """Register ``fn`` as mock data function for the dataset that shares its ``__name__``.

        ``fn`` is returned as passed, so this method can be used as a decorator. For a ``classmethod`` object the
        underlying function is registered.
        """
        if isinstance(fn, classmethod):
            mock_data_fn = fn.__func__
        else:
            mock_data_fn = fn

        dataset_mock = DatasetMock(mock_data_fn.__name__, mock_data_fn)
        self.data.append(dataset_mock)
        return fn

138

139
# Module-wide registry; the mock data functions below add themselves through `append_named_callable`.
DATASET_MOCKS = DatasetMocks()
140

141
142

class MNISTMockData:
    """Creates random image and label files for the MNIST-like dataset mocks.

    Every file starts with a header made of a magic number, the number of samples, and the shape of a single
    sample, each written as a 4 byte integer. The header is followed by the raw data converted to a big-endian
    dtype.
    """

    # Type code that is folded into the magic number for each supported dtype.
    _DTYPES_ID = {
        torch.uint8: 8,
        torch.int8: 9,
        torch.int16: 11,
        torch.int32: 12,
        torch.float32: 13,
        torch.float64: 14,
    }

    @classmethod
    def _magic(cls, dtype, ndim):
        """Combine the type code of ``dtype`` and ``ndim + 1`` into the magic number of the file header."""
        type_code = cls._DTYPES_ID[dtype]
        # `ndim` only counts the dimensions of a single sample; the extra dimension is the one over the samples.
        total_ndim = ndim + 1
        return type_code * 256 + total_ndim

    @staticmethod
    def _encode(t):
        """Serialize ``t`` as ``int32`` and return its bytes in reversed order."""
        native_bytes = torch.tensor(t, dtype=torch.int32).numpy().tobytes()
        # Reversing the native representation puts the most significant byte first on little-endian hosts.
        return native_bytes[::-1]

    @staticmethod
    def _big_endian_dtype(dtype):
        """Return the big-endian numpy dtype with the same kind and item size as the torch ``dtype``."""
        numpy_name = str(dtype).replace("torch.", "")
        native = getattr(np, numpy_name)().dtype
        return np.dtype(f">{native.kind}{native.itemsize}")

    @classmethod
    def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high):
        """Write a header followed by ``num_samples`` random samples of ``shape`` to ``root / filename``.

        ``compressor`` is called as ``compressor(path, "wb")`` and has to return a context manager that yields a
        writable binary file handle. ``low`` and ``high`` are forwarded to ``make_tensor``.
        """
        header = (cls._magic(dtype, len(shape)), num_samples, *shape)
        with compressor(root / filename, "wb") as fh:
            for value in header:
                fh.write(cls._encode(value))

            samples = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high)
            payload = samples.numpy().astype(cls._big_endian_dtype(dtype))
            fh.write(payload.tobytes())

    @classmethod
    def generate(
        cls,
        root,
        *,
        num_categories,
        num_samples=None,
        images_file,
        labels_file,
        image_size=(28, 28),
        image_dtype=torch.uint8,
        label_size=(),
        label_dtype=torch.uint8,
        compressor=None,
    ):
        """Create an images file and a matching labels file below ``root``.

        Args:
            root: Directory the files are written to.
            num_categories: Upper bound passed as ``high`` when the labels are generated.
            num_samples: Number of samples in both files. Defaults to ``num_categories``.
            images_file: Name of the images file.
            labels_file: Name of the labels file.
            image_size: Shape of a single image.
            image_dtype: Torch dtype of the images.
            label_size: Shape of a single label.
            label_dtype: Torch dtype of the labels.
            compressor: Opener used for both files. Defaults to ``gzip.open``.

        Returns:
            The number of samples that were written.
        """
        num_samples = num_categories if num_samples is None else num_samples
        compressor = gzip.open if compressor is None else compressor
        shared = dict(num_samples=num_samples, compressor=compressor)

        # The images are written before the labels, which fixes the order the random data is drawn in.
        cls._create_binary_file(
            root,
            images_file,
            shape=image_size,
            dtype=image_dtype,
            high=float("inf"),
            **shared,
        )
        cls._create_binary_file(
            root,
            labels_file,
            shape=label_size,
            dtype=label_dtype,
            high=num_categories,
            **shared,
        )

        return num_samples


217
@DATASET_MOCKS.append_named_callable
def mnist(info, root, config):
    """Create the images and labels file for the split of ``config`` and return the number of samples."""
    # Every split other than 'train' uses the 't10k' file names.
    prefix = "train" if config.split == "train" else "t10k"
    return MNISTMockData.generate(
        root,
        num_categories=len(info.categories),
        images_file=f"{prefix}-images-idx3-ubyte.gz",
        labels_file=f"{prefix}-labels-idx1-ubyte.gz",
    )


230
# 'fashionmnist' and 'kmnist' reuse the `mnist` mock data function.
DATASET_MOCKS.extend([DatasetMock(name, mnist) for name in ["fashionmnist", "kmnist"]])
231
232


233
234
@DATASET_MOCKS.append_named_callable
def emnist(info, root, _):
    """Create the files for all configs at once, bundle them in 'emnist-gzip.zip', and return the mock infos.

    The requested config is ignored; the returned dictionary has an entry for every config in ``info._configs``.
    """
    # The image sets that merge some lower case letters in their respective upper case variant still use dense
    # labels in the data files. Thus, the number of categories differs from len(info.categories) for them.
    merged_image_sets = ("Balanced", "By_Merge")

    mock_infos = {}
    file_names = set()
    for config in info._configs:
        image_set_id = config.image_set.replace("_", "").lower()
        prefix = f"emnist-{image_set_id}-{config.split}"
        images_file = f"{prefix}-images-idx3-ubyte.gz"
        labels_file = f"{prefix}-labels-idx1-ubyte.gz"
        file_names |= {images_file, labels_file}

        num_categories = 47 if config.image_set in merged_image_sets else len(info.categories)
        num_samples = MNISTMockData.generate(
            root,
            num_categories=num_categories,
            images_file=images_file,
            labels_file=labels_file,
        )
        mock_infos[config] = {"num_samples": num_samples}

    make_zip(root, "emnist-gzip.zip", *file_names)

    return mock_infos
260
261


262
@DATASET_MOCKS.append_named_callable
def qmnist(info, root, config):
    """Create the images and labels file that back the split of ``config``.

    For the 'train' and 'nist' splits the number of samples is returned. All test splits share one pair of
    files, so a request for any of them returns the sample counts of 'test', 'test10k', and 'test50k' together.
    """
    num_categories = len(info.categories)
    split = config.split

    # Only the 'nist' branch below deviates from gzip compression.
    suffix, compressor = ".gz", gzip.open

    if split == "train":
        prefix = "qmnist-train"
        num_samples_gen = num_categories + 2
        mock_infos = num_samples_gen
    elif split.startswith("test"):
        prefix = "qmnist-test"
        # 'test50k' consists of the images starting at index 10000. More than 10000 images have to be generated
        # for that split to not be empty.
        num_samples_gen = 10001
        mock_infos = {
            info.make_config(split="test"): num_samples_gen,
            info.make_config(split="test10k"): min(num_samples_gen, 10_000),
            info.make_config(split="test50k"): num_samples_gen - 10_000,
        }
    else:  # config.split == "nist"
        prefix = "xnist"
        suffix, compressor = ".xz", lzma.open
        num_samples_gen = num_categories + 3
        mock_infos = num_samples_gen

    MNISTMockData.generate(
        root,
        num_categories=num_categories,
        num_samples=num_samples_gen,
        images_file=f"{prefix}-images-idx3-ubyte{suffix}",
        labels_file=f"{prefix}-labels-idx2-int{suffix}",
        label_size=(8,),
        label_dtype=torch.int32,
        compressor=compressor,
    )
    return mock_infos
301
302


303
class CIFARMockData:
    """Creates pickled batch files with random data and packs them into a gzip-compressed tar archive."""

    # Length of a single flattened image row in a batch file.
    NUM_PIXELS = 32 * 32 * 3

    @classmethod
    def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1):
        """Pickle a dictionary with random ``uint8`` data and random labels to ``root / name``.

        The data is stored under ``"data"`` with shape ``(num_samples, NUM_PIXELS)``, the labels under
        ``labels_key`` as a list of integers drawn from ``[0, num_categories)``.
        """
        # The data is drawn before the labels to keep the order of the random number generation fixed.
        pixels = make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy()
        labels = torch.randint(0, num_categories, size=(num_samples,)).tolist()
        batch = {"data": pixels, labels_key: labels}

        with open(pathlib.Path(root) / name, "wb") as fh:
            pickle.dump(batch, fh)

    @classmethod
    def generate(
        cls,
        root,
        name,
        *,
        folder,
        train_files,
        test_files,
        num_categories,
        labels_key,
    ):
        """Create one batch file per entry of ``train_files`` and ``test_files`` and archive them.

        Args:
            root: Directory the archive is created in.
            name: File name of the archive.
            folder: Directory, relative to ``root``, that holds the batch files. It must not exist yet.
            train_files: Names of the training batch files.
            test_files: Names of the test batch files.
            num_categories: Exclusive upper bound of the generated labels.
            labels_key: Key the labels are stored under in each batch file.
        """
        batches_dir = root / folder
        batches_dir.mkdir()

        for file_name in [*train_files, *test_files]:
            cls._create_batch_file(
                batches_dir,
                file_name,
                num_categories=num_categories,
                labels_key=labels_key,
            )

        make_tar(root, name, batches_dir, compression="gz")


341
@DATASET_MOCKS.append_named_callable
def cifar10(info, root, config):
    """Create 'cifar-10-python.tar.gz' and return the number of batch files of the requested split.

    Every batch file holds a single sample, so the number of files equals the number of samples.
    """
    train_files = [f"data_batch_{idx}" for idx in range(1, 6)]
    test_files = ["test_batch"]

    CIFARMockData.generate(
        root=root,
        name="cifar-10-python.tar.gz",
        folder=pathlib.Path("cifar-10-batches-py"),
        train_files=train_files,
        test_files=test_files,
        num_categories=10,
        labels_key="labels",
    )

    if config.split == "train":
        return len(train_files)
    return len(test_files)


359
@DATASET_MOCKS.append_named_callable
def cifar100(info, root, config):
    """Create 'cifar-100-python.tar.gz' and return the number of batch files of the requested split.

    Every batch file holds a single sample, so the number of files equals the number of samples.
    """
    train_files = ["train"]
    test_files = ["test"]

    CIFARMockData.generate(
        root=root,
        name="cifar-100-python.tar.gz",
        folder=pathlib.Path("cifar-100-python"),
        train_files=train_files,
        test_files=test_files,
        num_categories=100,
        labels_key="fine_labels",
    )

    if config.split == "train":
        return len(train_files)
    return len(test_files)


377
@DATASET_MOCKS.append_named_callable
def caltech101(info, root, config):
    """Create the image and annotation archives and return the total number of images.

    For every category, two images and two ``.mat`` annotation files are created. The images are packed into
    '101_ObjectCategories.tar.gz' and the annotations into 'Annotations.tar'.
    """

    def create_ann_file(root, name):
        # scipy is only needed by this mock, so it is imported lazily.
        import scipy.io

        box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
        num_contour_points = int(torch.randint(3, 6, size=()))
        obj_contour = make_tensor((2, num_contour_points), dtype=torch.float64, low=0).numpy()

        scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))

    def create_ann_folder(root, name, file_name_fn, num_examples):
        folder = pathlib.Path(root) / name
        folder.mkdir(parents=True)

        for file_name in map(file_name_fn, range(num_examples)):
            create_ann_file(folder, file_name)

    def image_file_name(idx):
        return f"image_{idx + 1:04d}.jpg"

    def ann_file_name(idx):
        return f"annotation_{idx + 1:04d}.mat"

    images_root = root / "101_ObjectCategories"
    anns_root = root / "Annotations"

    # For these categories the annotation folder is named differently than the image folder.
    image_to_ann_category = {
        "Faces_2": "Faces",
        "Faces_3": "Faces_easy",
        "Motorbikes_16": "Motorbikes",
        "Airplanes_Side_2": "airplanes",
    }

    num_images_per_category = 2
    for category in info.categories:
        create_image_folder(
            root=images_root,
            name=category,
            file_name_fn=image_file_name,
            num_examples=num_images_per_category,
        )
        create_ann_folder(
            root=anns_root,
            name=image_to_ann_category.get(category, category),
            file_name_fn=ann_file_name,
            num_examples=num_images_per_category,
        )

    # NOTE(review): the folder name looks like a misspelling of "BACKGROUND_Google" -- confirm against the dataset
    # implementation before changing it.
    (images_root / "BACKGROUND_Goodle").mkdir()
    make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")

    make_tar(root, f"{anns_root.name}.tar", anns_root)

    return num_images_per_category * len(info.categories)


427
@DATASET_MOCKS.append_named_callable
def caltech256(info, root, config):
    """Create '256_ObjectCategories.tar' with two images per category and return the total number of images."""
    images_root = root / "256_ObjectCategories"
    num_images_per_category = 2

    for idx, category in enumerate(info.categories, 1):
        # The closure reads `idx` of the current iteration.
        # NOTE(review): this relies on `create_image_folder` calling it before the loop advances -- confirm.
        image_files = create_image_folder(
            images_root,
            name=f"{idx:03d}.{category}",
            file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        if category != "spider":
            continue

        # The 'spider' folder additionally gets an empty file that is not an image.
        with open(image_files[0].parent / "RENAME2", "w"):
            pass

    make_tar(root, f"{images_root.name}.tar", images_root)

    return num_images_per_category * len(info.categories)


447
@DATASET_MOCKS.append_named_callable
def imagenet(info, root, config):
    """Create the image archive for the split of ``config`` together with the devkit archive.

    - 'train': one image per WordNet ID, each packed into its own ``<wnid>.tar`` inside the train folder.
    - 'val': three images in 'ILSVRC2012_img_val'.
    - any other split (i.e. 'test'): three images in the 'test' sub-folder of 'ILSVRC2012_img_test_v10102019'.

    The image folder is archived as ``<folder name>.tar``. Regardless of the split, a devkit archive is created
    whose validation ground truth file holds one random label per generated sample.

    Returns:
        The number of samples of the requested split.
    """
    wnids = tuple(info.extra.wnid_to_category.keys())
    if config.split == "train":
        images_root = root / "ILSVRC2012_img_train"

        num_samples = len(wnids)

        for wnid in wnids:
            files = create_image_folder(
                root=images_root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            make_tar(images_root, f"{wnid}.tar", files[0].parent)
    elif config.split == "val":
        num_samples = 3
        files = create_image_folder(
            root=root,
            name="ILSVRC2012_img_val",
            file_name_fn=lambda image_idx: f"ILSVRC2012_val_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
        images_root = files[0].parent
    else:  # config.split == "test"
        images_root = root / "ILSVRC2012_img_test_v10102019"

        num_samples = 3

        create_image_folder(
            root=images_root,
            name="test",
            file_name_fn=lambda image_idx: f"ILSVRC2012_test_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
    make_tar(root, f"{images_root.name}.tar", images_root)

    devkit_root = root / "ILSVRC2012_devkit_t12"
    devkit_root.mkdir()
    data_root = devkit_root / "data"
    data_root.mkdir()
    with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
        for label in torch.randint(0, len(wnids), (num_samples,)).tolist():
            file.write(f"{label}\n")
    # Pass only the folder name, like for the images archive above. Passing the full path as archive name only
    # works as long as `root` is absolute.
    make_tar(root, f"{devkit_root.name}.tar.gz", devkit_root, compression="gz")

    return num_samples
Philip Meier's avatar
Philip Meier committed
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601


class CocoMockData:
    """Creates image archives and JSON annotation files with random content for the COCO mock."""

    @classmethod
    def _make_images_archive(cls, root, name, *, num_samples):
        """Create ``num_samples`` images in the folder ``name``, zip them, and return their metadata.

        The metadata is a list with one dictionary per image holding ``file_name``, ``id``, ``width``, and
        ``height``. The ``id`` is the integer value of the file stem.
        """
        paths = create_image_folder(
            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
        )

        images_meta = []
        for image_path in paths:
            with PIL.Image.open(image_path) as image:
                width, height = image.size
            meta = dict(file_name=image_path.name, id=int(image_path.stem), width=width, height=height)
            images_meta.append(meta)

        make_zip(root, f"{name}.zip")

        return images_meta

    @classmethod
    def _make_annotations_json(
        cls,
        root,
        name,
        *,
        images_meta,
        fn,
    ):
        """Write ``root / name`` with between one and four annotations per image.

        ``fn(ann_id, image_meta)`` supplies the annotation specific fields; ``id`` and ``image_id`` are added
        here. The annotations are stored sorted by their ID.

        Returns:
            Tensor with the number of annotations created for each image.
        """
        num_anns_per_image = torch.randint(1, 5, (len(images_meta),))
        num_anns_total = int(num_anns_per_image.sum())
        # Annotation IDs are handed out in random order so that they do not follow the image order.
        shuffled_ids = iter(torch.randperm(num_anns_total).tolist())

        anns_meta = []
        for image_meta, num_anns in zip(images_meta, num_anns_per_image.tolist()):
            for _ in range(num_anns):
                ann_id = next(shuffled_ids)
                anns_meta.append({**fn(ann_id, image_meta), "id": ann_id, "image_id": image_meta["id"]})
        anns_meta.sort(key=lambda ann: ann["id"])

        with open(root / name, "w") as file:
            json.dump(dict(images=images_meta, annotations=anns_meta), file)

        return num_anns_per_image

    @staticmethod
    def _make_instances_data(ann_id, image_meta):
        """Return the random fields of a single 'instances' annotation for the image."""

        def make_rle_segmentation():
            height, width = image_meta["height"], image_meta["width"]
            numel = height * width

            counts = []
            covered = 0
            while covered <= numel:
                run = int(torch.randint(5, 8, ()))
                counts.append(run)
                covered += run
            # The loop only stops after overshooting, so the last run is shortened to make the runs add up to
            # exactly the number of pixels.
            counts[-1] -= covered - numel
            return dict(counts=counts, size=[height, width])

        return {
            "segmentation": make_rle_segmentation(),
            "bbox": make_tensor((4,), dtype=torch.float32, low=0).tolist(),
            "iscrowd": True,
            "area": float(make_scalar(dtype=torch.float32)),
            "category_id": int(make_scalar(dtype=torch.int64)),
        }

    @staticmethod
    def _make_captions_data(ann_id, image_meta):
        """Return the fields of a single 'captions' annotation for the image."""
        caption = f"Caption {ann_id} describing image {image_meta['id']}."
        return {"caption": caption}

    @classmethod
    def _make_annotations(cls, root, name, *, images_meta):
        """Write the 'instances' and 'captions' JSON files and return the total number of annotations."""
        num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64)

        makers = {
            "instances": cls._make_instances_data,
            "captions": cls._make_captions_data,
        }
        for annotations, fn in makers.items():
            file_name = f"{annotations}_{name}.json"
            num_anns_per_image += cls._make_annotations_json(root, file_name, images_meta=images_meta, fn=fn)

        return int(num_anns_per_image.sum())

    @classmethod
    def generate(
        cls,
        root,
        *,
        year,
        num_samples,
    ):
        """Create the image archives of the 'train' and 'val' split of ``year`` plus the annotations archive.

        Returns:
            ``num_samples``, i.e. the number of images created per split.
        """
        annotations_dir = root / "annotations"
        annotations_dir.mkdir()

        for split in ("train", "val"):
            config_name = f"{split}{year}"
            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
            cls._make_annotations(annotations_dir, config_name, images_meta=images_meta)

        make_zip(root, f"annotations_trainval{year}.zip", annotations_dir)

        return num_samples


602
@DATASET_MOCKS.append_named_callable
def coco(info, root, config):
    """Create the mock data for the year of ``config`` and return the sample count for every config of that year."""
    same_year_configs = [config_ for config_ in info._configs if config_.year == config.year]
    # One call creates the files of all splits of the year, so every matching config shares its result.
    num_samples = CocoMockData.generate(root, year=config.year, num_samples=5)
    return {config_: num_samples for config_ in same_year_configs}


def config_id(name, config):
    """Build a test ID from the dataset ``name`` and the items of ``config``, joined by ``'-'``.

    Boolean options contribute their key, prefixed with ``'no_'`` if the value is false. All other options
    contribute ``str(value)``.
    """

    def describe(key, value):
        if not isinstance(value, bool):
            return str(value)
        return key if value else "no_" + key

    return "-".join([name, *(describe(key, value) for key, value in config.items())])


def parametrize_dataset_mocks(datasets_mocks):
    """Return a ``pytest.mark.parametrize`` mark over every ``(dataset_mock, config)`` combination.

    Each mock is paired with every entry of its ``configs``. The ID of a combination comes from ``config_id``.
    """
    params = []
    for dataset_mock in datasets_mocks:
        for config in dataset_mock.configs:
            test_id = config_id(dataset_mock.name, config)
            params.append(pytest.param(dataset_mock, config, id=test_id))

    return pytest.mark.parametrize(("dataset_mock", "config"), params)