Commit cc26cd81 authored by panning

merge v0.16.0

parents f78f29f5 fbb4cc54
......@@ -164,7 +164,7 @@ class RandomResizeAndCrop(torch.nn.Module):
# The reason we don't rely on RandomResizedCrop is because of a significant
# difference in the parametrization of both transforms, in particular,
# because of the way the random parameters are sampled in both transforms,
# which leads to fairly different resuts (and different epe). For more details see
# which leads to fairly different results (and different epe). For more details see
# https://github.com/pytorch/vision/pull/5026/files#r762932579
def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, stretch_prob=0.8):
super().__init__()
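To make the parametrization difference concrete, here is a rough, hedged sketch (not the exact implementation; see the PR link above): `RandomResizedCrop` samples an area fraction and aspect ratio for the crop and then resizes that crop, while this transform (following RAFT) samples one global scale factor for the whole image and flow and only then takes a fixed-size crop.

```python
import math
import torch

# torchvision.transforms.RandomResizedCrop (defaults): sample an area fraction and an
# aspect ratio, crop that region, then resize the crop to a fixed output size.
area_frac = torch.empty(1).uniform_(0.08, 1.0).item()
log_ratio = torch.empty(1).uniform_(math.log(3 / 4), math.log(4 / 3)).item()

# RandomResizeAndCrop (following RAFT): sample one global scale factor, roughly
# 2 ** uniform(min_scale, max_scale), resize the whole image and flow with it,
# then take a fixed-size random crop. Different sampling -> different crops -> different epe.
scale = 2 ** torch.empty(1).uniform_(-0.2, 0.5).item()
```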
......@@ -196,8 +196,12 @@ class RandomResizeAndCrop(torch.nn.Module):
if torch.rand(1).item() < self.resize_prob:
# rescale the images
img1 = F.resize(img1, size=(new_h, new_w))
img2 = F.resize(img2, size=(new_h, new_w))
# We hard-code antialias=False to preserve results after we changed
# its default from None to True (see
# https://github.com/pytorch/vision/pull/7160)
# TODO: we could re-train the OF models with antialias=True?
img1 = F.resize(img1, size=(new_h, new_w), antialias=False)
img2 = F.resize(img2, size=(new_h, new_w), antialias=False)
if valid_flow_mask is None:
flow = F.resize(flow, size=(new_h, new_w))
flow = flow * torch.tensor([scale_x, scale_y])[:, None, None]
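A minimal sketch (hypothetical tensor sizes) of why `antialias` is pinned here: bilinear downsampling with and without antialiasing yields different pixel values, so checkpoints trained with one setting do not reproduce exactly with the other.

```python
import torch
from torchvision.transforms import functional as F

img = torch.rand(3, 480, 640)
old_style = F.resize(img, size=(240, 320), antialias=False)   # behavior kept by these scripts
new_default = F.resize(img, size=(240, 320), antialias=True)  # current torchvision default
print((old_style - new_default).abs().max())  # non-zero difference
```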
......@@ -208,7 +212,7 @@ class RandomResizeAndCrop(torch.nn.Module):
# Note: For sparse datasets (Kitti), the original code uses a "margin"
# See e.g. https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220
# We don't, not sure it matters much
# We don't, not sure if it matters much
y0 = torch.randint(0, img1.shape[1] - self.crop_size[0], size=(1,)).item()
x0 = torch.randint(0, img1.shape[2] - self.crop_size[1], size=(1,)).item()
......
......@@ -181,7 +181,7 @@ def sequence_loss(flow_preds, flow_gt, valid_flow_mask, gamma=0.8, max_flow=400)
if gamma > 1:
raise ValueError(f"Gamma should be < 1, got {gamma}.")
# exlude invalid pixels and extremely large diplacements
# exclude invalid pixels and extremely large displacements
flow_norm = torch.sum(flow_gt**2, dim=1).sqrt()
valid_flow_mask = valid_flow_mask & (flow_norm < max_flow)
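A toy, hedged illustration of the masking above (assuming `flow_gt` has shape `(N, 2, H, W)`): a pixel with ground-truth flow `(3, 4)` has norm 5, well below `max_flow`, so it stays valid; pixels with norm at or above `max_flow` are dropped.

```python
import torch

flow_gt = torch.tensor([[[[3.0]], [[4.0]]]])        # shape (1, 2, 1, 1)
valid_flow_mask = torch.tensor([[[True]]])          # shape (1, 1, 1)
flow_norm = torch.sum(flow_gt**2, dim=1).sqrt()     # tensor([[[5.]]])
valid_flow_mask = valid_flow_mask & (flow_norm < 400)  # stays True for this pixel
```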
......@@ -248,7 +248,7 @@ def setup_ddp(args):
# https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2
if all(key in os.environ for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")):
# if we're here, the script was called with torchrun. Otherwise
# if we're here, the script was called with torchrun. Otherwise,
# these args will be set already by the run_with_submitit script
args.local_rank = int(os.environ["LOCAL_RANK"])
args.rank = int(os.environ["RANK"])
......
......@@ -68,11 +68,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None):
# if more than 1k pixels occupied in the image
return sum(obj["area"] for obj in anno) > 1000
if not isinstance(dataset, torchvision.datasets.CocoDetection):
raise TypeError(
f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}"
)
ids = []
for ds_idx, img_id in enumerate(dataset.ids):
ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
......@@ -86,7 +81,7 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None):
return dataset
def get_coco(root, image_set, transforms):
def get_coco(root, image_set, transforms, use_v2=False):
PATHS = {
"train": ("train2017", os.path.join("annotations", "instances_train2017.json")),
"val": ("val2017", os.path.join("annotations", "instances_val2017.json")),
......@@ -94,12 +89,23 @@ def get_coco(root, image_set, transforms):
}
CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72]
transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms])
img_folder, ann_file = PATHS[image_set]
img_folder = os.path.join(root, img_folder)
ann_file = os.path.join(root, ann_file)
# The 2 "Compose" below achieve the same thing: converting coco detection
# samples into segmentation-compatible samples. They just do it with
# slightly different implementations. We could refactor and unify, but
# keeping them separate helps keep the v2 version clean
if use_v2:
import v2_extras
from torchvision.datasets import wrap_dataset_for_transforms_v2
transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms])
dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms)
dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"})
else:
transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms])
dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms)
if image_set == "train":
......
import torch
import transforms as T
def get_modules(use_v2):
# We need a protected import to avoid the V2 warning in case just V1 is used
if use_v2:
import torchvision.transforms.v2
import torchvision.tv_tensors
import v2_extras
return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras
else:
import transforms
return transforms, None, None
class SegmentationPresetTrain:
def __init__(self, *, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
min_size = int(0.5 * base_size)
max_size = int(2.0 * base_size)
def __init__(
self,
*,
base_size,
crop_size,
hflip_prob=0.5,
mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225),
backend="pil",
use_v2=False,
):
T, tv_tensors, v2_extras = get_modules(use_v2)
transforms = []
backend = backend.lower()
if backend == "tv_tensor":
transforms.append(T.ToImage())
elif backend == "tensor":
transforms.append(T.PILToTensor())
elif backend != "pil":
raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")
transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))]
trans = [T.RandomResize(min_size, max_size)]
if hflip_prob > 0:
trans.append(T.RandomHorizontalFlip(hflip_prob))
trans.extend(
[
T.RandomCrop(crop_size),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
T.Normalize(mean=mean, std=std),
transforms += [T.RandomHorizontalFlip(hflip_prob)]
if use_v2:
# We need a custom pad transform here, since the padding we want to perform here is fundamentally
# different from the padding in `RandomCrop` if `pad_if_needed=True`.
transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})]
transforms += [T.RandomCrop(crop_size)]
if backend == "pil":
transforms += [T.PILToTensor()]
if use_v2:
img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor
transforms += [
T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True)
]
)
self.transforms = T.Compose(trans)
else:
# No need to explicitly convert masks as they're magically int64 already
transforms += [T.ToDtype(torch.float, scale=True)]
transforms += [T.Normalize(mean=mean, std=std)]
if use_v2:
transforms += [T.ToPureTensor()]
self.transforms = T.Compose(transforms)
def __call__(self, img, target):
return self.transforms(img, target)
class SegmentationPresetEval:
def __init__(self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
self.transforms = T.Compose(
[
T.RandomResize(base_size, base_size),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
def __init__(
self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False
):
T, _, _ = get_modules(use_v2)
transforms = []
backend = backend.lower()
if backend == "tensor":
transforms += [T.PILToTensor()]
elif backend == "tv_tensor":
transforms += [T.ToImage()]
elif backend != "pil":
raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")
if use_v2:
transforms += [T.Resize(size=(base_size, base_size))]
else:
transforms += [T.RandomResize(min_size=base_size, max_size=base_size)]
if backend == "pil":
# Note: we could just convert to pure tensors even in v2?
transforms += [T.ToImage() if use_v2 else T.PILToTensor()]
transforms += [
T.ToDtype(torch.float, scale=True),
T.Normalize(mean=mean, std=std),
]
)
if use_v2:
transforms += [T.ToPureTensor()]
self.transforms = T.Compose(transforms)
def __call__(self, img, target):
return self.transforms(img, target)
......@@ -14,24 +14,30 @@ from torch.optim.lr_scheduler import PolynomialLR
from torchvision.transforms import functional as F, InterpolationMode
def get_dataset(dir_path, name, image_set, transform):
def get_dataset(args, is_train):
def sbd(*args, **kwargs):
kwargs.pop("use_v2")
return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs)
def voc(*args, **kwargs):
kwargs.pop("use_v2")
return torchvision.datasets.VOCSegmentation(*args, **kwargs)
paths = {
"voc": (dir_path, torchvision.datasets.VOCSegmentation, 21),
"voc_aug": (dir_path, sbd, 21),
"coco": (dir_path, get_coco, 21),
"voc": (args.data_path, voc, 21),
"voc_aug": (args.data_path, sbd, 21),
"coco": (args.data_path, get_coco, 21),
}
p, ds_fn, num_classes = paths[name]
p, ds_fn, num_classes = paths[args.dataset]
ds = ds_fn(p, image_set=image_set, transforms=transform)
image_set = "train" if is_train else "val"
ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2)
return ds, num_classes
def get_transform(train, args):
if train:
return presets.SegmentationPresetTrain(base_size=520, crop_size=480)
def get_transform(is_train, args):
if is_train:
return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2)
elif args.weights and args.test_only:
weights = torchvision.models.get_weight(args.weights)
trans = weights.transforms()
......@@ -44,7 +50,7 @@ def get_transform(train, args):
return preprocessing
else:
return presets.SegmentationPresetEval(base_size=520)
return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2)
def criterion(inputs, target):
......@@ -120,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi
def main(args):
if args.backend.lower() != "pil" and not args.use_v2:
# TODO: Support tensor backend in V1?
raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.")
if args.use_v2 and args.dataset != "coco":
raise ValueError("v2 is only support supported for coco dataset for now.")
if args.output_dir:
utils.mkdir(args.output_dir)
......@@ -134,8 +146,8 @@ def main(args):
else:
torch.backends.cudnn.benchmark = True
dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args))
dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args))
dataset, num_classes = get_dataset(args, is_train=True)
dataset_test, _ = get_dataset(args, is_train=False)
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
......@@ -260,7 +272,7 @@ def get_args_parser(add_help=True):
parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path")
parser.add_argument("--dataset", default="coco", type=str, help="dataset name")
parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name")
parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss")
parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument(
"-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
......@@ -307,6 +319,8 @@ def get_args_parser(add_help=True):
# Mixed precision training parameters
parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")
parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive")
parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms")
return parser
......
......@@ -35,7 +35,7 @@ class RandomResize:
def __call__(self, image, target):
size = random.randint(self.min_size, self.max_size)
image = F.resize(image, size)
image = F.resize(image, size, antialias=True)
target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST)
return image, target
......@@ -81,11 +81,14 @@ class PILToTensor:
return image, target
class ConvertImageDtype:
def __init__(self, dtype):
class ToDtype:
def __init__(self, dtype, scale=False):
self.dtype = dtype
self.scale = scale
def __call__(self, image, target):
if not self.scale:
return image.to(dtype=self.dtype), target
image = F.convert_image_dtype(image, self.dtype)
return image, target
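A small usage sketch (hypothetical uint8 input) of the `scale` flag: with `scale=False` the image is only cast, with `scale=True` the values are also rescaled into `[0, 1]` via `convert_image_dtype`.

```python
import torch

img = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
cast_only, _ = ToDtype(torch.float)(img, target=None)                # float, values still in [0, 255]
cast_scaled, _ = ToDtype(torch.float, scale=True)(img, target=None)  # float, values in [0, 1]
```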
......
......@@ -267,9 +267,9 @@ def init_distributed_mode(args):
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
# elif "SLURM_PROCID" in os.environ:
# args.rank = int(os.environ["SLURM_PROCID"])
# args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
......
"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1."""
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
class PadIfSmaller(v2.Transform):
def __init__(self, size, fill=0):
super().__init__()
self.size = size
self.fill = v2._utils._setup_fill_arg(fill)
def _get_params(self, sample):
_, height, width = v2._utils.query_chw(sample)
padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)]
needs_padding = any(padding)
return dict(padding=padding, needs_padding=needs_padding)
def _transform(self, inpt, params):
if not params["needs_padding"]:
return inpt
fill = v2._utils._get_fill(self.fill, type(inpt))
fill = v2._utils._convert_fill_arg(fill)
return v2.functional.pad(inpt, padding=params["padding"], fill=fill)
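A hypothetical usage sketch of `PadIfSmaller` (reusing the `torch`/`tv_tensors` imports at the top of this file): a sample smaller than the crop size is padded up to it, with the mask padded with the ignore value 255 and everything else with 0.

```python
pad = PadIfSmaller(480, fill={tv_tensors.Mask: 255, "others": 0})
img = tv_tensors.Image(torch.zeros(3, 300, 400))
mask = tv_tensors.Mask(torch.zeros(300, 400, dtype=torch.uint8))
img, mask = pad(img, mask)
print(img.shape, mask.shape)  # torch.Size([3, 480, 480]) torch.Size([480, 480])
```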
class CocoDetectionToVOCSegmentation(v2.Transform):
"""Turn samples from datasets.CocoDetection into the same format as VOCSegmentation.
This is achieved in two steps:
1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately,
the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not
present in VOC are dropped and replaced by background.
2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual
mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where
the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation
mask while pixels that belong to multiple detection masks are marked as invalid.
"""
COCO_TO_VOC_LABEL_MAP = dict(
zip(
[0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72],
range(21),
)
)
INVALID_VALUE = 255
def _coco_detection_masks_to_voc_segmentation_mask(self, target):
if "masks" not in target:
return None
instance_masks, instance_labels_coco = target["masks"], target["labels"]
valid_labels_voc = [
(idx, label_voc)
for idx, label_coco in enumerate(instance_labels_coco.tolist())
if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None
]
if not valid_labels_voc:
return None
valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc)
instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8)
instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8)
# Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as
# there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step.
segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0)
segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE
return segmentation_mask
def forward(self, image, target):
segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target)
if segmentation_mask is None:
segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8)
return image, tv_tensors.Mask(segmentation_mask)
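A toy, hedged sketch of step 2 from the docstring above, showing how the stacked detection masks are merged and overlapping pixels marked invalid:

```python
import torch

instance_masks = torch.tensor(      # two (2, 3) detection masks
    [[[1, 1, 0], [0, 0, 0]],
     [[0, 1, 0], [0, 0, 1]]],
    dtype=torch.uint8,
)
instance_labels_voc = torch.tensor([12, 15], dtype=torch.uint8)  # already-mapped VOC labels

segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0)
segmentation_mask[instance_masks.sum(dim=0) > 1] = 255  # pixel covered by both instances -> invalid
print(segmentation_mask)  # [[12, 255, 0], [0, 0, 15]] as a uint8 tensor
```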
......@@ -48,7 +48,7 @@ class PKSampler(Sampler):
# Ensures there are enough classes to sample from
if len(self.groups) < p:
raise ValueError("There are not enought classes to sample from")
raise ValueError("There are not enough classes to sample from")
def __iter__(self):
# Shuffle samples within groups
......
......@@ -76,7 +76,7 @@ Input data augmentations at validation time (with optional parameters):
5. Convert BCHW to CBHW
This translates in the following set of command-line arguments. Please note that `--batch-size` parameter controls the
batch size per GPU. Moreover note that our default `--lr` is configured for 64 GPUs which is how many we used for the
batch size per GPU. Moreover, note that our default `--lr` is configured for 64 GPUs which is how many we used for the
Video resnet models:
```
# number of frames per clip
......
......@@ -15,7 +15,11 @@ class VideoClassificationPresetTrain:
):
trans = [
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
# We hard-code antialias=False to preserve results after we changed
# its default from None to True (see
# https://github.com/pytorch/vision/pull/7160)
# TODO: we could re-train the video models with antialias=True?
transforms.Resize(resize_size, antialias=False),
]
if hflip_prob > 0:
trans.append(transforms.RandomHorizontalFlip(hflip_prob))
......@@ -31,7 +35,11 @@ class VideoClassificationPresetEval:
self.transforms = transforms.Compose(
[
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
# We hard-code antialias=False to preserve results after we changed
# its default from None to True (see
# https://github.com/pytorch/vision/pull/7160)
# TODO: we could re-train the video models with antialias=True?
transforms.Resize(resize_size, antialias=False),
transforms.Normalize(mean=mean, std=std),
transforms.CenterCrop(crop_size),
ConvertBCHWtoCBHW(),
......
import asyncio
import sys
from pathlib import Path
from time import perf_counter
from urllib.parse import urlsplit
import aiofiles
import aiohttp
from torchvision import models
from tqdm.asyncio import tqdm
async def main(download_root):
download_root.mkdir(parents=True, exist_ok=True)
urls = {weight.url for name in models.list_models() for weight in iter(models.get_model_weights(name))}
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=None)) as session:
await tqdm.gather(*[download(download_root, session, url) for url in urls])
async def download(download_root, session, url):
response = await session.get(url, params=dict(source="ci"))
assert response.ok
file_name = Path(urlsplit(url).path).name
async with aiofiles.open(download_root / file_name, "wb") as f:
async for data in response.content.iter_any():
await f.write(data)
if __name__ == "__main__":
download_root = (
(Path(sys.argv[1]) if len(sys.argv) > 1 else Path("~/.cache/torch/hub/checkpoints")).expanduser().resolve()
)
print(f"Downloading model weights to {download_root}")
start = perf_counter()
asyncio.get_event_loop().run_until_complete(main(download_root))
stop = perf_counter()
minutes, seconds = divmod(stop - start, 60)
print(f"Download took {minutes:2.0f}m {seconds:2.0f}s")
......@@ -2,7 +2,7 @@
universal=1
[metadata]
license_file = LICENSE
license_files = LICENSE
[pep8]
max-line-length = 120
......@@ -10,7 +10,7 @@ max-line-length = 120
[flake8]
# note: we ignore all 501s (line too long) anyway as they're taken care of by black
max-line-length = 120
ignore = E203, E402, W503, W504, F821, E501
ignore = E203, E402, W503, W504, F821, E501, B, C4, EXE
per-file-ignores =
__init__.py: F401, F403, F405
./hubconf.py: F401
......
......@@ -86,7 +86,6 @@ if os.getenv("PYTORCH_VERSION"):
pytorch_dep += "==" + os.getenv("PYTORCH_VERSION")
requirements = [
"typing_extensions",
"numpy",
"requests",
pytorch_dep,
......@@ -166,10 +165,13 @@ def get_extensions():
+ glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp"))
+ glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp"))
)
source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm"))
print("Compiling extensions with following flags:")
force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
print(f" FORCE_CUDA: {force_cuda}")
force_mps = os.getenv("FORCE_MPS", "0") == "1"
print(f" FORCE_MPS: {force_mps}")
debug_mode = os.getenv("DEBUG", "0") == "1"
print(f" DEBUG: {debug_mode}")
use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1"
......@@ -231,6 +233,8 @@ def get_extensions():
define_macros += [("WITH_HIP", None)]
nvcc_flags = []
extra_compile_args["nvcc"] = nvcc_flags
elif torch.backends.mps.is_available() or force_mps:
sources += source_mps
if sys.platform == "win32":
define_macros += [("torchvision_EXPORTS", None)]
......@@ -247,6 +251,9 @@ def get_extensions():
extra_compile_args["nvcc"] = [f for f in nvcc_flags if not ("-O" in f or "-g" in f)]
extra_compile_args["nvcc"].append("-O0")
extra_compile_args["nvcc"].append("-g")
else:
print("Compiling with debug mode OFF")
extra_compile_args["cxx"].append("-g0")
sources = [os.path.join(extensions_dir, s) for s in sources]
......@@ -327,6 +334,8 @@ def get_extensions():
use_jpeg = use_jpeg and jpeg_found
if use_jpeg:
print("Building torchvision with JPEG image support")
print(f" libjpeg include path: {jpeg_include}")
print(f" libjpeg lib path: {jpeg_lib}")
image_link_flags.append("jpeg")
if jpeg_conda:
image_library += [jpeg_lib]
......@@ -352,11 +361,14 @@ def get_extensions():
image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))]
image_path = os.path.join(extensions_dir, "io", "image")
image_src = (
glob.glob(os.path.join(image_path, "*.cpp"))
+ glob.glob(os.path.join(image_path, "cpu", "*.cpp"))
+ glob.glob(os.path.join(image_path, "cuda", "*.cpp"))
)
image_src = glob.glob(os.path.join(image_path, "*.cpp")) + glob.glob(os.path.join(image_path, "cpu", "*.cpp"))
if is_rocm_pytorch:
image_src += glob.glob(os.path.join(image_path, "hip", "*.cpp"))
# we need to exclude this in favor of the hipified source
image_src.remove(os.path.join(image_path, "image.cpp"))
else:
image_src += glob.glob(os.path.join(image_path, "cuda", "*.cpp"))
if use_png or use_jpeg:
ext_modules.append(
......@@ -464,8 +476,8 @@ def get_extensions():
"swresample",
"swscale",
],
extra_compile_args=["-std=c++14"] if os.name != "nt" else ["/std:c++14", "/MP"],
extra_link_args=["-std=c++14" if os.name != "nt" else "/std:c++14"],
extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"],
extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"],
)
)
......@@ -564,6 +576,7 @@ if __name__ == "__main__":
url="https://github.com/pytorch/vision",
description="image and video datasets and models for torch deep learning",
long_description=readme,
long_description_content_type="text/markdown",
license="BSD",
# Package info
packages=find_packages(exclude=("test",)),
......@@ -574,7 +587,7 @@ if __name__ == "__main__":
"scipy": ["scipy"],
},
ext_modules=get_extensions(),
python_requires=">=3.7",
python_requires=">=3.8",
cmdclass={
"build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
"clean": clean,
......
import bz2
import collections.abc
import csv
import functools
import gzip
import io
import itertools
import json
import lzma
import pathlib
import pickle
import random
import shutil
import unittest.mock
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import numpy as np
import pytest
import torch
from common_utils import combinations_grid
from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip
from torch.nn.functional import one_hot
from torch.testing import make_tensor as _make_tensor
from torchvision.prototype import datasets
make_tensor = functools.partial(_make_tensor, device="cpu")
make_scalar = functools.partial(make_tensor, ())
__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"]
class DatasetMock:
def __init__(self, name, *, mock_data_fn, configs):
# FIXME: error handling for unknown names
self.name = name
self.mock_data_fn = mock_data_fn
self.configs = configs
def _parse_mock_info(self, mock_info):
if mock_info is None:
raise pytest.UsageError(
f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an "
f"integer indicating the number of samples for the current `config`."
)
elif isinstance(mock_info, int):
mock_info = dict(num_samples=mock_info)
elif not isinstance(mock_info, dict):
raise pytest.UsageError(
f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object "
f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no "
f"additional information is required for specific tests, the number of samples can also be returned as "
f"an integer."
)
elif "num_samples" not in mock_info:
raise pytest.UsageError(
f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a "
f"`'num_samples'` entry indicating the number of samples."
)
return mock_info
def load(self, config):
# `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
# test/test_prototype_builtin_datasets.py
root = pathlib.Path(datasets.home()) / self.name
# We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
# this will only download **and** preprocess if the file is not present. In other words, if we already place
# the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
# To avoid that we first place the mock data in a temporary directory and patch the download logic to move it to
# `root` only when it is requested.
tmp_mock_data_folder = root / "__mock__"
tmp_mock_data_folder.mkdir(parents=True)
mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
def patched_download(resource, root, **kwargs):
src = tmp_mock_data_folder / resource.file_name
if not src.exists():
raise pytest.UsageError(
f"Dataset '{self.name}' requires the file {resource.file_name} for {config}"
f"but it was not created by the mock data function."
)
dst = root / resource.file_name
shutil.move(str(src), str(root))
return dst
with unittest.mock.patch(
"torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
):
dataset = datasets.load(self.name, **config)
extra_files = list(tmp_mock_data_folder.glob("**/*"))
if extra_files:
raise pytest.UsageError(
(
f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
f"but they were not loaded:\n\n"
)
+ "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
)
tmp_mock_data_folder.rmdir()
return dataset, mock_info
def config_id(name, config):
parts = [name]
for name, value in config.items():
if isinstance(value, bool):
part = ("" if value else "no_") + name
else:
part = str(value)
parts.append(part)
return "-".join(parts)
def parametrize_dataset_mocks(*dataset_mocks, marks=None):
mocks = {}
for mock in dataset_mocks:
if isinstance(mock, DatasetMock):
mocks[mock.name] = mock
elif isinstance(mock, collections.abc.Mapping):
mocks.update(mock)
else:
raise pytest.UsageError(
f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, "
f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, "
f"but got {mock} instead."
)
dataset_mocks = mocks
if marks is None:
marks = {}
elif not isinstance(marks, collections.abc.Mapping):
raise pytest.UsageError()
return pytest.mark.parametrize(
("dataset_mock", "config"),
[
pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ()))
for name, dataset_mock in dataset_mocks.items()
for config in dataset_mock.configs
],
)
DATASET_MOCKS = {}
def register_mock(name=None, *, configs):
def wrapper(mock_data_fn):
nonlocal name
if name is None:
name = mock_data_fn.__name__
DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs)
return mock_data_fn
return wrapper
class MNISTMockData:
_DTYPES_ID = {
torch.uint8: 8,
torch.int8: 9,
torch.int16: 11,
torch.int32: 12,
torch.float32: 13,
torch.float64: 14,
}
@classmethod
def _magic(cls, dtype, ndim):
return cls._DTYPES_ID[dtype] * 256 + ndim + 1
@staticmethod
def _encode(t):
return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1]
@staticmethod
def _big_endian_dtype(dtype):
np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype
return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}")
@classmethod
def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high):
with compressor(root / filename, "wb") as fh:
for meta in (cls._magic(dtype, len(shape)), num_samples, *shape):
fh.write(cls._encode(meta))
data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high)
fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes())
@classmethod
def generate(
cls,
root,
*,
num_categories,
num_samples=None,
images_file,
labels_file,
image_size=(28, 28),
image_dtype=torch.uint8,
label_size=(),
label_dtype=torch.uint8,
compressor=None,
):
if num_samples is None:
num_samples = num_categories
if compressor is None:
compressor = gzip.open
cls._create_binary_file(
root,
images_file,
num_samples=num_samples,
shape=image_size,
dtype=image_dtype,
compressor=compressor,
high=float("inf"),
)
cls._create_binary_file(
root,
labels_file,
num_samples=num_samples,
shape=label_size,
dtype=label_dtype,
compressor=compressor,
high=num_categories,
)
return num_samples
def mnist(root, config):
prefix = "train" if config["split"] == "train" else "t10k"
return MNISTMockData.generate(
root,
num_categories=10,
images_file=f"{prefix}-images-idx3-ubyte.gz",
labels_file=f"{prefix}-labels-idx1-ubyte.gz",
)
DATASET_MOCKS.update(
{
name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test")))
for name in ["mnist", "fashionmnist", "kmnist"]
}
)
@register_mock(
configs=combinations_grid(
split=("train", "test"),
image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
)
)
def emnist(root, config):
num_samples_map = {}
file_names = set()
for split, image_set in itertools.product(
("train", "test"),
("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
):
prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}"
images_file = f"{prefix}-images-idx3-ubyte.gz"
labels_file = f"{prefix}-labels-idx1-ubyte.gz"
file_names.update({images_file, labels_file})
num_samples_map[(split, image_set)] = MNISTMockData.generate(
root,
# The image sets that merge some lower case letters in their respective upper case variant, still use dense
# labels in the data files. Thus, num_categories != len(categories) there.
num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62,
images_file=images_file,
labels_file=labels_file,
)
make_zip(root, "emnist-gzip.zip", *file_names)
return num_samples_map[(config["split"], config["image_set"])]
@register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist")))
def qmnist(root, config):
num_categories = 10
if config["split"] == "train":
num_samples = num_samples_gen = num_categories + 2
prefix = "qmnist-train"
suffix = ".gz"
compressor = gzip.open
elif config["split"].startswith("test"):
# The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create
# more than 10000 images for the dataset to not be empty.
num_samples_gen = 10001
num_samples = {
"test": num_samples_gen,
"test10k": min(num_samples_gen, 10_000),
"test50k": num_samples_gen - 10_000,
}[config["split"]]
prefix = "qmnist-test"
suffix = ".gz"
compressor = gzip.open
else: # config["split"] == "nist"
num_samples = num_samples_gen = num_categories + 3
prefix = "xnist"
suffix = ".xz"
compressor = lzma.open
MNISTMockData.generate(
root,
num_categories=num_categories,
num_samples=num_samples_gen,
images_file=f"{prefix}-images-idx3-ubyte{suffix}",
labels_file=f"{prefix}-labels-idx2-int{suffix}",
label_size=(8,),
label_dtype=torch.int32,
compressor=compressor,
)
return num_samples
class CIFARMockData:
NUM_PIXELS = 32 * 32 * 3
@classmethod
def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1):
content = {
"data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(),
labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(),
}
with open(pathlib.Path(root) / name, "wb") as fh:
pickle.dump(content, fh)
@classmethod
def generate(
cls,
root,
name,
*,
folder,
train_files,
test_files,
num_categories,
labels_key,
):
folder = root / folder
folder.mkdir()
files = (*train_files, *test_files)
for file in files:
cls._create_batch_file(
folder,
file,
num_categories=num_categories,
labels_key=labels_key,
)
make_tar(root, name, folder, compression="gz")
@register_mock(configs=combinations_grid(split=("train", "test")))
def cifar10(root, config):
train_files = [f"data_batch_{idx}" for idx in range(1, 6)]
test_files = ["test_batch"]
CIFARMockData.generate(
root=root,
name="cifar-10-python.tar.gz",
folder=pathlib.Path("cifar-10-batches-py"),
train_files=train_files,
test_files=test_files,
num_categories=10,
labels_key="labels",
)
return len(train_files if config["split"] == "train" else test_files)
@register_mock(configs=combinations_grid(split=("train", "test")))
def cifar100(root, config):
train_files = ["train"]
test_files = ["test"]
CIFARMockData.generate(
root=root,
name="cifar-100-python.tar.gz",
folder=pathlib.Path("cifar-100-python"),
train_files=train_files,
test_files=test_files,
num_categories=100,
labels_key="fine_labels",
)
return len(train_files if config["split"] == "train" else test_files)
@register_mock(configs=[dict()])
def caltech101(root, config):
def create_ann_file(root, name):
import scipy.io
box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy()
scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))
def create_ann_folder(root, name, file_name_fn, num_examples):
root = pathlib.Path(root) / name
root.mkdir(parents=True)
for idx in range(num_examples):
create_ann_file(root, file_name_fn(idx))
images_root = root / "101_ObjectCategories"
anns_root = root / "Annotations"
image_category_map = {
"Faces": "Faces_2",
"Faces_easy": "Faces_3",
"Motorbikes": "Motorbikes_16",
"airplanes": "Airplanes_Side_2",
}
categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"]
num_images_per_category = 2
for category in categories:
create_image_folder(
root=images_root,
name=category,
file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
num_examples=num_images_per_category,
)
create_ann_folder(
root=anns_root,
name=image_category_map.get(category, category),
file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
num_examples=num_images_per_category,
)
(images_root / "BACKGROUND_Goodle").mkdir()
make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")
make_tar(root, f"{anns_root.name}.tar", anns_root)
return num_images_per_category * len(categories)
@register_mock(configs=[dict()])
def caltech256(root, config):
dir = root / "256_ObjectCategories"
num_images_per_category = 2
categories = [
(1, "ak47"),
(127, "laptop-101"),
(198, "spider"),
(257, "clutter"),
]
for category_idx, category in categories:
files = create_image_folder(
dir,
name=f"{category_idx:03d}.{category}",
file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg",
num_examples=num_images_per_category,
)
if category == "spider":
open(files[0].parent / "RENAME2", "w").close()
make_tar(root, f"{dir.name}.tar", dir)
return num_images_per_category * len(categories)
@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def imagenet(root, config):
from scipy.io import savemat
info = datasets.info("imagenet")
if config["split"] == "train":
num_samples = len(info["wnids"])
archive_name = "ILSVRC2012_img_train.tar"
files = []
for wnid in info["wnids"]:
create_image_folder(
root=root,
name=wnid,
file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
num_examples=1,
)
files.append(make_tar(root, f"{wnid}.tar"))
elif config["split"] == "val":
num_samples = 3
archive_name = "ILSVRC2012_img_val.tar"
files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)]
devkit_root = root / "ILSVRC2012_devkit_t12"
data_root = devkit_root / "data"
data_root.mkdir(parents=True)
with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist():
file.write(f"{label}\n")
num_children = 0
synsets = [
(idx, wnid, category, "", num_children, [], 0, 0)
for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1)
]
num_children = 1
synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5))
synsets = np.array(
synsets,
dtype=np.dtype(
[
("ILSVRC2012_ID", "O"),
("WNID", "O"),
("words", "O"),
("gloss", "O"),
("num_children", "O"),
("children", "O"),
("wordnet_height", "O"),
("num_train_images", "O"),
]
),
)
savemat(data_root / "meta.mat", dict(synsets=synsets))
make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz")
else: # config["split"] == "test"
num_samples = 5
archive_name = "ILSVRC2012_img_test_v10102019.tar"
files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)]
make_tar(root, archive_name, *files)
return num_samples
class CocoMockData:
@classmethod
def _make_annotations_json(
cls,
root,
name,
*,
images_meta,
fn,
):
num_anns_per_image = torch.randint(1, 5, (len(images_meta),))
num_anns_total = int(num_anns_per_image.sum())
ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)])
anns_meta = []
for image_meta, num_anns in zip(images_meta, num_anns_per_image):
for _ in range(num_anns):
ann_id = int(next(ann_ids_iter))
anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, image_id=image_meta["id"]))
anns_meta.sort(key=lambda ann: ann["id"])
with open(root / name, "w") as file:
json.dump(dict(images=images_meta, annotations=anns_meta), file)
return num_anns_per_image
@staticmethod
def _make_instances_data(ann_id, image_meta):
def make_rle_segmentation():
height, width = image_meta["height"], image_meta["width"]
numel = height * width
counts = []
while sum(counts) <= numel:
counts.append(int(torch.randint(5, 8, ())))
if sum(counts) > numel:
counts[-1] -= sum(counts) - numel
return dict(counts=counts, size=[height, width])
return dict(
segmentation=make_rle_segmentation(),
bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(),
iscrowd=True,
area=float(make_scalar(dtype=torch.float32)),
category_id=int(make_scalar(dtype=torch.int64)),
)
@staticmethod
def _make_captions_data(ann_id, image_meta):
return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.")
@classmethod
def _make_annotations(cls, root, name, *, images_meta):
num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64)
for annotations, fn in (
("instances", cls._make_instances_data),
("captions", cls._make_captions_data),
):
num_anns_per_image += cls._make_annotations_json(
root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn
)
return int(num_anns_per_image.sum())
@classmethod
def generate(
cls,
root,
*,
split,
year,
num_samples,
):
annotations_dir = root / "annotations"
annotations_dir.mkdir()
for split_ in ("train", "val"):
config_name = f"{split_}{year}"
images_meta = [
dict(
file_name=f"{idx:012d}.jpg",
id=idx,
width=width,
height=height,
)
for idx, (height, width) in enumerate(
torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
)
]
if split_ == split:
create_image_folder(
root,
config_name,
file_name_fn=lambda idx: images_meta[idx]["file_name"],
num_examples=num_samples,
size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
)
make_zip(root, f"{config_name}.zip")
cls._make_annotations(
annotations_dir,
config_name,
images_meta=images_meta,
)
make_zip(root, f"annotations_trainval{year}.zip", annotations_dir)
return num_samples
@register_mock(
configs=combinations_grid(
split=("train", "val"),
year=("2017", "2014"),
annotations=("instances", "captions", None),
)
)
def coco(root, config):
return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)
class SBDMockData:
_NUM_CATEGORIES = 20
@classmethod
def _make_split_files(cls, root_map, *, split):
splits_and_idcs = [
("train", [0, 1, 2]),
("val", [3]),
]
if split == "train_noval":
splits_and_idcs.append(("train_noval", [0, 2]))
ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs}
for split, ids in ids_map.items():
with open(root_map[split] / f"{split}.txt", "w") as fh:
fh.writelines(f"{id}\n" for id in ids)
return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}
@classmethod
def _make_anns_folder(cls, root, name, ids):
from scipy.io import savemat
anns_folder = root / name
anns_folder.mkdir()
sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist()
for id, size in zip(ids, sizes):
savemat(
anns_folder / f"{id}.mat",
{
"GTcls": {
"Boundaries": cls._make_boundaries(size),
"Segmentation": cls._make_segmentation(size),
}
},
)
return sizes
@classmethod
def _make_boundaries(cls, size):
from scipy.sparse import csc_matrix
return [
[csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES)
]
@classmethod
def _make_segmentation(cls, size):
return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy()
@classmethod
def generate(cls, root, *, split):
archive_folder = root / "benchmark_RELEASE"
dataset_folder = archive_folder / "dataset"
dataset_folder.mkdir(parents=True, exist_ok=True)
ids, num_samples_map = cls._make_split_files(
defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split
)
sizes = cls._make_anns_folder(dataset_folder, "cls", ids)
create_image_folder(
dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx]
)
make_tar(root, "benchmark.tgz", archive_folder, compression="gz")
return num_samples_map[split]
@register_mock(configs=combinations_grid(split=("train", "val", "train_noval")))
def sbd(root, config):
return SBDMockData.generate(root, split=config["split"])
@register_mock(configs=[dict()])
def semeion(root, config):
num_samples = 3
num_categories = 10
images = torch.rand(num_samples, 256)
labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories)
with open(root / "semeion.data", "w") as fh:
for image, one_hot_label in zip(images, labels):
image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image])
labels_columns = " ".join([str(label.item()) for label in one_hot_label])
fh.write(f"{image_columns} {labels_columns} \n")
return num_samples
class VOCMockData:
_TRAIN_VAL_FILE_NAMES = {
"2007": "VOCtrainval_06-Nov-2007.tar",
"2008": "VOCtrainval_14-Jul-2008.tar",
"2009": "VOCtrainval_11-May-2009.tar",
"2010": "VOCtrainval_03-May-2010.tar",
"2011": "VOCtrainval_25-May-2011.tar",
"2012": "VOCtrainval_11-May-2012.tar",
}
_TEST_FILE_NAMES = {
"2007": "VOCtest_06-Nov-2007.tar",
}
@classmethod
def _make_split_files(cls, root, *, year, trainval):
split_folder = root / "ImageSets"
if trainval:
idcs_map = {
"train": [0, 1, 2],
"val": [3, 4],
}
idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]]
else:
idcs_map = {
"test": [5],
}
ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()}
for task_sub_folder in ("Main", "Segmentation"):
task_folder = split_folder / task_sub_folder
task_folder.mkdir(parents=True, exist_ok=True)
for split, ids in ids_map.items():
with open(task_folder / f"{split}.txt", "w") as fh:
fh.writelines(f"{id}\n" for id in ids)
return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}
@classmethod
def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples):
folder = root / name
folder.mkdir(parents=True, exist_ok=True)
for idx in range(num_examples):
cls._make_detection_ann_file(folder, file_name_fn(idx))
@classmethod
def _make_detection_ann_file(cls, root, name):
def add_child(parent, name, text=None):
child = ET.SubElement(parent, name)
child.text = str(text)
return child
def add_name(obj, name="dog"):
add_child(obj, "name", name)
def add_size(obj):
obj = add_child(obj, "size")
size = {"width": 0, "height": 0, "depth": 3}
for name, text in size.items():
add_child(obj, name, text)
def add_bndbox(obj):
obj = add_child(obj, "bndbox")
bndbox = {"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4}
for name, text in bndbox.items():
add_child(obj, name, text)
annotation = ET.Element("annotation")
add_size(annotation)
obj = add_child(annotation, "object")
add_name(obj)
add_bndbox(obj)
with open(root / name, "wb") as fh:
fh.write(ET.tostring(annotation))
@classmethod
def generate(cls, root, *, year, trainval):
archive_folder = root
if year == "2011":
archive_folder = root / "TrainVal"
data_folder = archive_folder / "VOCdevkit"
else:
archive_folder = data_folder = root / "VOCdevkit"
data_folder = data_folder / f"VOC{year}"
data_folder.mkdir(parents=True, exist_ok=True)
ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
for make_folder_fn, name, suffix in [
(create_image_folder, "JPEGImages", ".jpg"),
(create_image_folder, "SegmentationClass", ".png"),
(cls._make_detection_anns_folder, "Annotations", ".xml"),
]:
make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)
return num_samples_map
@register_mock(
configs=[
*combinations_grid(
split=("train", "val", "trainval"),
year=("2007", "2008", "2009", "2010", "2011", "2012"),
task=("detection", "segmentation"),
),
*combinations_grid(
split=("test",),
year=("2007",),
task=("detection", "segmentation"),
),
],
)
def voc(root, config):
trainval = config["split"] != "test"
return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]]
class CelebAMockData:
@classmethod
def _make_ann_file(cls, root, name, data, *, field_names=None):
with open(root / name, "w") as file:
if field_names:
file.write(f"{len(data)}\r\n")
file.write(" ".join(field_names) + "\r\n")
file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data)
_SPLIT_TO_IDX = {
"train": 0,
"val": 1,
"test": 2,
}
@classmethod
def _make_split_file(cls, root):
num_samples_map = {"train": 4, "val": 3, "test": 2}
data = [
(f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split])
for split, num_samples in num_samples_map.items()
for idx in range(num_samples)
]
cls._make_ann_file(root, "list_eval_partition.txt", data)
image_file_names, _ = zip(*data)
return image_file_names, num_samples_map
@classmethod
def _make_identity_file(cls, root, image_file_names):
cls._make_ann_file(
root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names]
)
@classmethod
def _make_attributes_file(cls, root, image_file_names):
field_names = ("5_o_Clock_Shadow", "Young")
data = [
[name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]]
for name in image_file_names
]
cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, ""))
@classmethod
def _make_bounding_boxes_file(cls, root, image_file_names):
field_names = ("image_id", "x_1", "y_1", "width", "height")
data = [
[f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]]
for name in image_file_names
]
cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names)
@classmethod
def _make_landmarks_file(cls, root, image_file_names):
field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y")
data = [
[
name,
*[
f"{coord:4d}" if idx else coord
for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist())
],
]
for name in image_file_names
]
cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names)
@classmethod
def generate(cls, root):
image_file_names, num_samples_map = cls._make_split_file(root)
image_files = create_image_folder(
root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names)
)
make_zip(root, image_files[0].parent.with_suffix(".zip").name)
for make_ann_file_fn in (
cls._make_identity_file,
cls._make_attributes_file,
cls._make_bounding_boxes_file,
cls._make_landmarks_file,
):
make_ann_file_fn(root, image_file_names)
return num_samples_map
@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def celeba(root, config):
return CelebAMockData.generate(root)[config["split"]]
@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def country211(root, config):
split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"])
split_folder.mkdir(parents=True, exist_ok=True)
num_examples = {
"train": 3,
"val": 4,
"test": 5,
}[config["split"]]
classes = ("AD", "BS", "GR")
for cls in classes:
create_image_folder(
split_folder,
name=cls,
file_name_fn=lambda idx: f"{idx}.jpg",
num_examples=num_examples,
)
make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
return num_examples * len(classes)
@register_mock(configs=combinations_grid(split=("train", "test")))
def food101(root, config):
data_folder = root / "food-101"
num_images_per_class = 3
image_folder = data_folder / "images"
categories = ["apple_pie", "baby_back_ribs", "waffles"]
image_ids = []
for category in categories:
image_files = create_image_folder(
image_folder,
category,
file_name_fn=lambda idx: f"{idx:04d}.jpg",
num_examples=num_images_per_class,
)
image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files)
meta_folder = data_folder / "meta"
meta_folder.mkdir()
with open(meta_folder / "classes.txt", "w") as file:
for category in categories:
file.write(f"{category}\n")
splits = ["train", "test"]
num_samples_map = {}
for offset, split in enumerate(splits):
image_ids_in_split = image_ids[offset :: len(splits)]
num_samples_map[split] = len(image_ids_in_split)
with open(meta_folder / f"{split}.txt", "w") as file:
for image_id in image_ids_in_split:
file.write(f"{image_id}\n")
make_tar(root, f"{data_folder.name}.tar.gz", compression="gz")
return num_samples_map[config["split"]]
@register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10)))
def dtd(root, config):
data_folder = root / "dtd"
num_images_per_class = 3
image_folder = data_folder / "images"
categories = {"banded", "marbled", "zigzagged"}
image_ids_per_category = {
category: [
str(path.relative_to(path.parents[1]).as_posix())
for path in create_image_folder(
image_folder,
category,
file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
num_examples=num_images_per_class,
)
]
for category in categories
}
meta_folder = data_folder / "labels"
meta_folder.mkdir()
with open(meta_folder / "labels_joint_anno.txt", "w") as file:
for cls, image_ids in image_ids_per_category.items():
for image_id in image_ids:
joint_categories = random.choices(
list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ()))
)
file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n")
image_ids = list(itertools.chain(*image_ids_per_category.values()))
splits = ("train", "val", "test")
num_samples_map = {}
for fold in range(1, 11):
random.shuffle(image_ids)
for offset, split in enumerate(splits):
image_ids_in_config = image_ids[offset :: len(splits)]
with open(meta_folder / f"{split}{fold}.txt", "w") as file:
file.write("\n".join(image_ids_in_config) + "\n")
num_samples_map[(split, fold)] = len(image_ids_in_config)
make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")
return num_samples_map[config["split"], config["fold"]]
@register_mock(configs=combinations_grid(split=("train", "test")))
def fer2013(root, config):
split = config["split"]
num_samples = 5 if split == "train" else 3
path = root / f"{split}.csv"
with open(path, "w", newline="") as file:
field_names = ["emotion"] if split == "train" else []
field_names.append("pixels")
file.write(",".join(field_names) + "\n")
writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
for _ in range(num_samples):
rowdict = {
"pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)])
}
if split == "train":
rowdict["emotion"] = int(torch.randint(7, ()))
writer.writerow(rowdict)
make_zip(root, f"{path.name}.zip", path)
return num_samples
@register_mock(configs=combinations_grid(split=("train", "test")))
def gtsrb(root, config):
num_examples_per_class = 5 if config["split"] == "train" else 3
classes = ("00000", "00042", "00012")
num_examples = num_examples_per_class * len(classes)
csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]
def _make_ann_file(path, num_examples, class_idx):
if class_idx == "random":
class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()
with open(path, "w") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
writer.writeheader()
for image_idx in range(num_examples):
writer.writerow(
{
"Filename": f"{image_idx:05d}.ppm",
"Width": torch.randint(1, 100, size=()).item(),
"Height": torch.randint(1, 100, size=()).item(),
"Roi.X1": torch.randint(1, 100, size=()).item(),
"Roi.Y1": torch.randint(1, 100, size=()).item(),
"Roi.X2": torch.randint(1, 100, size=()).item(),
"Roi.Y2": torch.randint(1, 100, size=()).item(),
"ClassId": class_idx,
}
)
archive_folder = root / "GTSRB"
if config["split"] == "train":
train_folder = archive_folder / "Training"
train_folder.mkdir(parents=True)
for class_idx in classes:
create_image_folder(
train_folder,
name=class_idx,
file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
num_examples=num_examples_per_class,
)
_make_ann_file(
path=train_folder / class_idx / f"GT-{class_idx}.csv",
num_examples=num_examples_per_class,
class_idx=int(class_idx),
)
make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
else:
test_folder = archive_folder / "Final_Test"
test_folder.mkdir(parents=True)
create_image_folder(
test_folder,
name="Images",
file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
num_examples=num_examples,
)
make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)
_make_ann_file(
path=root / "GT-final_test.csv",
num_examples=num_examples,
class_idx="random",
)
make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")
return num_examples
@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def clevr(root, config):
data_folder = root / "CLEVR_v1.0"
num_samples_map = {
"train": 3,
"val": 2,
"test": 1,
}
images_folder = data_folder / "images"
image_files = {
split: create_image_folder(
images_folder,
split,
file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg",
num_examples=num_samples,
)
for split, num_samples in num_samples_map.items()
}
scenes_folder = data_folder / "scenes"
scenes_folder.mkdir()
for split in ["train", "val"]:
with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file:
json.dump(
{
"scenes": [
{
"image_filename": image_file.name,
# We currently only return the number of objects in a scene.
# Thus, it is sufficient for now to only mock the number of elements.
"objects": [None] * int(torch.randint(1, 5, ())),
}
for image_file in image_files[split]
]
},
file,
)
make_zip(root, f"{data_folder.name}.zip", data_folder)
return num_samples_map[config["split"]]
class OxfordIIITPetMockData:
@classmethod
def _meta_to_split_and_classification_ann(cls, meta, idx):
image_id = "_".join(
[
*[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()],
str(idx),
]
)
class_id = str(meta["label"] + 1)
species = "1" if meta["species"] == "cat" else "2"
breed_id = "-1"
return (image_id, class_id, species, breed_id)
@classmethod
    def generate(cls, root):
classification_anns_meta = (
dict(cls="Abyssinian", label=0, species="cat"),
dict(cls="Keeshond", label=18, species="dog"),
dict(cls="Yorkshire Terrier", label=36, species="dog"),
)
split_and_classification_anns = [
            cls._meta_to_split_and_classification_ann(meta, idx)
for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
]
image_ids, *_ = zip(*split_and_classification_anns)
image_files = create_image_folder(
root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids)
)
anns_folder = root / "annotations"
anns_folder.mkdir()
random.shuffle(split_and_classification_anns)
splits = ("trainval", "test")
num_samples_map = {}
for offset, split in enumerate(splits):
split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)]
with open(anns_folder / f"{split}.txt", "w") as file:
writer = csv.writer(file, delimiter=" ")
for split_and_classification_ann in split_and_classification_anns_in_split:
writer.writerow(split_and_classification_ann)
num_samples_map[split] = len(split_and_classification_anns_in_split)
segmentation_files = create_image_folder(
anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids)
)
# The dataset has some rogue files
for path in image_files[:3]:
path.with_suffix(".mat").touch()
for path in segmentation_files:
path.with_name(f".{path.name}").touch()
make_tar(root, "images.tar.gz", compression="gz")
make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz")
return num_samples_map
@register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test")))
def oxford_iiit_pet(root, config):
return OxfordIIITPetMockData.generate(root)[config["split"]]
class _CUB200MockData:
@classmethod
def _category_folder(cls, category, idx):
return f"{idx:03d}.{category}"
@classmethod
def _file_stem(cls, category, idx):
return f"{category}_{idx:04d}"
@classmethod
def _make_images(cls, images_folder):
image_files = []
for category_idx, category in [
(1, "Black_footed_Albatross"),
(100, "Brown_Pelican"),
(200, "Common_Yellowthroat"),
]:
image_files.extend(
create_image_folder(
images_folder,
cls._category_folder(category, category_idx),
lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg",
num_examples=5,
)
)
return image_files
class CUB2002011MockData(_CUB200MockData):
@classmethod
def _make_archive(cls, root):
archive_folder = root / "CUB_200_2011"
images_folder = archive_folder / "images"
image_files = cls._make_images(images_folder)
image_ids = list(range(1, len(image_files) + 1))
with open(archive_folder / "images.txt", "w") as file:
file.write(
"\n".join(
f"{id} {path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files)
)
)
split_ids = torch.randint(2, (len(image_ids),)).tolist()
counts = Counter(split_ids)
num_samples_map = {"train": counts[1], "test": counts[0]}
with open(archive_folder / "train_test_split.txt", "w") as file:
file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids)))
with open(archive_folder / "bounding_boxes.txt", "w") as file:
file.write(
"\n".join(
" ".join(
str(item)
for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()]
)
for image_id in image_ids
)
)
make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz")
return image_files, num_samples_map
@classmethod
def _make_segmentations(cls, root, image_files):
segmentations_folder = root / "segmentations"
for image_file in image_files:
folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1]))
folder.mkdir(exist_ok=True, parents=True)
create_image_file(
folder,
image_file.with_suffix(".png").name,
size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()],
)
make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz")
@classmethod
def generate(cls, root):
image_files, num_samples_map = cls._make_archive(root)
cls._make_segmentations(root, image_files)
return num_samples_map
class CUB2002010MockData(_CUB200MockData):
@classmethod
def _make_hidden_rouge_file(cls, *files):
for file in files:
(file.parent / f"._{file.name}").touch()
@classmethod
def _make_splits(cls, root, image_files):
split_folder = root / "lists"
split_folder.mkdir()
random.shuffle(image_files)
splits = ("train", "test")
num_samples_map = {}
for offset, split in enumerate(splits):
image_files_in_split = image_files[offset :: len(splits)]
split_file = split_folder / f"{split}.txt"
with open(split_file, "w") as file:
file.write(
"\n".join(
sorted(
str(image_file.relative_to(image_file.parents[1]).as_posix())
for image_file in image_files_in_split
)
)
)
cls._make_hidden_rouge_file(split_file)
num_samples_map[split] = len(image_files_in_split)
make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz")
return num_samples_map
@classmethod
def _make_anns(cls, root, image_files):
from scipy.io import savemat
anns_folder = root / "annotations-mat"
for image_file in image_files:
ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1])
ann_file.parent.mkdir(parents=True, exist_ok=True)
savemat(
ann_file,
{
"seg": torch.randint(
256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8
).numpy(),
"bbox": dict(
zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist())
),
},
)
readme_file = anns_folder / "README.txt"
readme_file.touch()
cls._make_hidden_rouge_file(readme_file)
make_tar(root, "annotations.tgz", anns_folder, compression="gz")
@classmethod
def generate(cls, root):
images_folder = root / "images"
image_files = cls._make_images(images_folder)
cls._make_hidden_rouge_file(*image_files)
make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz")
num_samples_map = cls._make_splits(root, image_files)
cls._make_anns(root, image_files)
return num_samples_map
@register_mock(configs=combinations_grid(split=("train", "test"), year=("2010", "2011")))
def cub200(root, config):
num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root)
return num_samples_map[config["split"]]
@register_mock(configs=[dict()])
def eurosat(root, config):
data_folder = root / "2750"
data_folder.mkdir(parents=True)
num_examples_per_class = 3
categories = ["AnnualCrop", "Forest"]
for category in categories:
create_image_folder(
root=data_folder,
name=category,
file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg",
num_examples=num_examples_per_class,
)
make_zip(root, "EuroSAT.zip", data_folder)
return len(categories) * num_examples_per_class
@register_mock(configs=combinations_grid(split=("train", "test", "extra")))
def svhn(root, config):
import scipy.io as sio
num_samples = {
"train": 2,
"test": 3,
"extra": 4,
}[config["split"]]
sio.savemat(
root / f"{config['split']}_32x32.mat",
{
"X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8),
"y": np.random.randint(10, size=(num_samples,), dtype=np.uint8),
},
)
return num_samples
@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def pcam(root, config):
import h5py
num_images = {"train": 2, "test": 3, "val": 4}[config["split"]]
split = "valid" if config["split"] == "val" else config["split"]
images_io = io.BytesIO()
with h5py.File(images_io, "w") as f:
f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8)
targets_io = io.BytesIO()
with h5py.File(targets_io, "w") as f:
f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8)
# Create .gz compressed files
images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz"
targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz"
for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)):
compressed_data = gzip.compress(uncompressed_file_io.getbuffer())
with open(compressed_file_name, "wb") as compressed_file:
compressed_file.write(compressed_data)
return num_images
@register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test")))
def stanford_cars(root, config):
import scipy.io as io
from numpy.core.records import fromarrays
split = config["split"]
num_samples = {"train": 5, "test": 7}[split]
num_categories = 3
if split == "train":
images_folder_name = "cars_train"
devkit = root / "devkit"
devkit.mkdir()
annotations_mat_path = devkit / "cars_train_annos.mat"
else:
images_folder_name = "cars_test"
annotations_mat_path = root / "cars_test_annos_withlabels.mat"
create_image_folder(
root=root,
name=images_folder_name,
file_name_fn=lambda image_index: f"{image_index:5d}.jpg",
num_examples=num_samples,
)
make_tar(root, f"cars_{split}.tgz", images_folder_name)
bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8)
classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8)
fnames = [f"{i:5d}.jpg" for i in range(num_samples)]
rec_array = fromarrays(
[bbox, bbox, bbox, bbox, classes, fnames],
names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"],
)
io.savemat(annotations_mat_path, {"annotations": rec_array})
if split == "train":
make_tar(root, "car_devkit.tgz", devkit, compression="gz")
return num_samples
@register_mock(configs=combinations_grid(split=("train", "test")))
def usps(root, config):
num_samples = {"train": 15, "test": 7}[config["split"]]
    with bz2.open(root / f"usps{'.t' if config['split'] != 'train' else ''}.bz2", "wb") as fh:
lines = []
for _ in range(num_samples):
label = make_tensor(1, low=1, high=11, dtype=torch.int)
values = make_tensor(256, low=-1, high=1, dtype=torch.float)
lines.append(
" ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))])
)
fh.write("\n".join(lines).encode())
return num_samples
import os
from collections import defaultdict
from numbers import Number
from typing import Any, List
import torch
from torch.utils._python_dispatch import TorchDispatchMode
from torch.utils._pytree import tree_map
from torchvision.models._api import Weights
aten = torch.ops.aten
quantized = torch.ops.quantized
def get_shape(i):
if isinstance(i, torch.Tensor):
return i.shape
elif hasattr(i, "weight"):
return i.weight().shape
else:
raise ValueError(f"Unknown type {type(i)}")
def prod(x):
res = 1
for i in x:
res *= i
return res
def matmul_flop(inputs: List[Any], outputs: List[Any]) -> Number:
"""
Count flops for matmul.
"""
# Inputs should be a list of length 2.
# Inputs contains the shapes of two matrices.
input_shapes = [get_shape(v) for v in inputs]
assert len(input_shapes) == 2, input_shapes
assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes
flop = prod(input_shapes[0]) * input_shapes[-1][-1]
return flop
def addmm_flop(inputs: List[Any], outputs: List[Any]) -> Number:
"""
Count flops for fully connected layers.
"""
# Count flop for nn.Linear
# inputs is a list of length 3.
input_shapes = [get_shape(v) for v in inputs[1:3]]
# input_shapes[0]: [batch size, input feature dimension]
    # input_shapes[1]: [input feature dimension, output feature dimension]
assert len(input_shapes[0]) == 2, input_shapes[0]
assert len(input_shapes[1]) == 2, input_shapes[1]
batch_size, input_dim = input_shapes[0]
output_dim = input_shapes[1][1]
flops = batch_size * input_dim * output_dim
return flops
def bmm_flop(inputs: List[Any], outputs: List[Any]) -> Number:
"""
Count flops for the bmm operation.
"""
# Inputs should be a list of length 2.
    # Inputs contains the shapes of the two input tensors.
assert len(inputs) == 2, len(inputs)
input_shapes = [get_shape(v) for v in inputs]
n, c, t = input_shapes[0]
d = input_shapes[-1][-1]
flop = n * c * t * d
return flop
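# Illustrative sketch (not part of the original utilities): batched matrix
# multiplication of a (2, 3, 4) tensor with a (2, 4, 5) tensor performs
# 2 * 3 * 4 * 5 = 120 multiply-adds according to the rule above.
def _example_bmm_flop():
    a = torch.randn(2, 3, 4)
    b = torch.randn(2, 4, 5)
    return bmm_flop([a, b], [torch.bmm(a, b)])  # == 120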
def conv_flop_count(
x_shape: List[int],
w_shape: List[int],
out_shape: List[int],
transposed: bool = False,
) -> Number:
"""
    Count flops for convolution. Note that only multiplications are
    counted; additions and the bias term are ignored.
    Flops for a transposed convolution are calculated as
    flops = batch_size * prod(w_shape) * prod(x_shape[2:]).
Args:
x_shape (list(int)): The input shape before convolution.
w_shape (list(int)): The filter shape.
out_shape (list(int)): The output shape after convolution.
        transposed (bool): whether the convolution is transposed
Returns:
int: the number of flops
"""
batch_size = x_shape[0]
conv_shape = (x_shape if transposed else out_shape)[2:]
flop = batch_size * prod(w_shape) * prod(conv_shape)
return flop
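# Illustrative sanity check of the formula above (a sketch, not part of the
# original utilities): a 3x3 convolution mapping 3 -> 64 channels on a single
# 224x224 input with unit stride and padding produces a (1, 64, 224, 224)
# output, i.e. 1 * (64 * 3 * 3 * 3) * (224 * 224) = 86_704_128 multiply-adds.
def _example_conv_flop_count():
    return conv_flop_count(
        x_shape=[1, 3, 224, 224],
        w_shape=[64, 3, 3, 3],
        out_shape=[1, 64, 224, 224],
    )  # == 86_704_128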
def conv_flop(inputs: List[Any], outputs: List[Any]):
"""
Count flops for convolution.
"""
x, w = inputs[:2]
x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0]))
transposed = inputs[6]
return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)
def quant_conv_flop(inputs: List[Any], outputs: List[Any]):
"""
Count flops for quantized convolution.
"""
x, w = inputs[:2]
x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0]))
return conv_flop_count(x_shape, w_shape, out_shape, transposed=False)
def transpose_shape(shape):
return [shape[1], shape[0]] + list(shape[2:])
def conv_backward_flop(inputs: List[Any], outputs: List[Any]):
grad_out_shape, x_shape, w_shape = [get_shape(i) for i in inputs[:3]]
output_mask = inputs[-1]
fwd_transposed = inputs[7]
flop_count = 0
if output_mask[0]:
grad_input_shape = get_shape(outputs[0])
flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed)
if output_mask[1]:
grad_weight_shape = get_shape(outputs[1])
flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed)
return flop_count
def scaled_dot_product_flash_attention_flop(inputs: List[Any], outputs: List[Any]):
# FIXME: this needs to count the flops of this kernel
# https://github.com/pytorch/pytorch/blob/207b06d099def9d9476176a1842e88636c1f714f/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp#L52-L267
return 0
flop_mapping = {
aten.mm: matmul_flop,
aten.matmul: matmul_flop,
aten.addmm: addmm_flop,
aten.bmm: bmm_flop,
aten.convolution: conv_flop,
aten._convolution: conv_flop,
aten.convolution_backward: conv_backward_flop,
quantized.conv2d: quant_conv_flop,
quantized.conv2d_relu: quant_conv_flop,
aten._scaled_dot_product_flash_attention: scaled_dot_product_flash_attention_flop,
}
unmapped_ops = set()
def normalize_tuple(x):
if not isinstance(x, tuple):
return (x,)
return x
class FlopCounterMode(TorchDispatchMode):
def __init__(self, model=None):
self.flop_counts = defaultdict(lambda: defaultdict(int))
self.parents = ["Global"]
# global mod
if model is not None:
for name, module in dict(model.named_children()).items():
module.register_forward_pre_hook(self.enter_module(name))
module.register_forward_hook(self.exit_module(name))
def enter_module(self, name):
def f(module, inputs):
self.parents.append(name)
inputs = normalize_tuple(inputs)
out = self.create_backwards_pop(name)(*inputs)
return out
return f
def exit_module(self, name):
def f(module, inputs, outputs):
assert self.parents[-1] == name
self.parents.pop()
outputs = normalize_tuple(outputs)
return self.create_backwards_push(name)(*outputs)
return f
def create_backwards_push(self, name):
class PushState(torch.autograd.Function):
@staticmethod
def forward(ctx, *args):
args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
if len(args) == 1:
return args[0]
return args
@staticmethod
def backward(ctx, *grad_outs):
self.parents.append(name)
return grad_outs
return PushState.apply
def create_backwards_pop(self, name):
class PopState(torch.autograd.Function):
@staticmethod
def forward(ctx, *args):
args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
if len(args) == 1:
return args[0]
return args
@staticmethod
def backward(ctx, *grad_outs):
assert self.parents[-1] == name
self.parents.pop()
return grad_outs
return PopState.apply
def __enter__(self):
self.flop_counts.clear()
super().__enter__()
def __exit__(self, *args):
# print(f"Total: {sum(self.flop_counts['Global'].values()) / 1e9} GFLOPS")
# for mod in self.flop_counts.keys():
# print(f"Module: ", mod)
# for k, v in self.flop_counts[mod].items():
# print(f"{k}: {v / 1e9} GFLOPS")
# print()
super().__exit__(*args)
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
kwargs = kwargs if kwargs else {}
out = func(*args, **kwargs)
func_packet = func._overloadpacket
if func_packet in flop_mapping:
flop_count = flop_mapping[func_packet](args, normalize_tuple(out))
for par in self.parents:
self.flop_counts[par][func_packet] += flop_count
else:
unmapped_ops.add(func_packet)
return out
def get_flops(self):
return sum(self.flop_counts["Global"].values()) / 1e9
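# Illustrative usage sketch (the model and input size are arbitrary example
# choices, not something the original module prescribes):
def _example_count_flops_resnet18():
    from torchvision.models import resnet18
    model = resnet18().eval()
    counter = FlopCounterMode(model)
    with counter:
        model(torch.randn(1, 3, 224, 224))
    # total GFLOPs accumulated under the "Global" key
    return counter.get_flops()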
def get_dims(module_name, height, width):
# detection models have curated input sizes
if module_name == "detection":
# we can feed a batch of 1 for detection model instead of a list of 1 image
dims = (3, height, width)
elif module_name == "video":
# hard-coding the time dimension to size 16
dims = (1, 16, 3, height, width)
else:
dims = (1, 3, height, width)
return dims
def get_ops(model: torch.nn.Module, weight: Weights, height=512, width=512):
module_name = model.__module__.split(".")[-2]
dims = get_dims(module_name=module_name, height=height, width=width)
input_tensor = torch.randn(dims)
# try:
preprocess = weight.transforms()
if module_name == "optical_flow":
inp = preprocess(input_tensor, input_tensor)
else:
# hack to enable mod(*inp) for optical_flow models
inp = [preprocess(input_tensor)]
model.eval()
flop_counter = FlopCounterMode(model)
with flop_counter:
# detection models expect a list of 3d tensors as inputs
if module_name == "detection":
model(inp)
else:
model(*inp)
flops = flop_counter.get_flops()
return round(flops, 3)
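# Illustrative usage sketch (assumes ResNet18_Weights is available in the
# installed torchvision; constructing the model with pretrained weights
# downloads the checkpoint on first use):
def _example_get_ops():
    from torchvision.models import resnet18, ResNet18_Weights
    weights = ResNet18_Weights.DEFAULT
    model = resnet18(weights=weights)
    # GFLOPs of one forward pass on a 224x224 input
    return get_ops(model, weight=weights, height=224, width=224)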
def get_file_size_mb(weight):
weights_path = os.path.join(os.getenv("HOME"), ".cache/torch/hub/checkpoints", weight.url.split("/")[-1])
weights_size_mb = os.path.getsize(weights_path) / 1024 / 1024
return round(weights_size_mb, 3)
import contextlib
import functools
import itertools
import os
import pathlib
import random
import re
import shutil
import sys
import tempfile
import warnings
from subprocess import CalledProcessError, check_output, STDOUT
import numpy as np
import PIL.Image
import pytest
import torch
import torch.testing
from PIL import Image
from torchvision import io
import __main__ # noqa: 401
from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair
from torchvision import io, tv_tensors
from torchvision.transforms._functional_tensor import _max_value as get_max_value
from torchvision.transforms.v2.functional import to_image, to_pil_image
IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true"
IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"])
IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None
IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1"
CUDA_NOT_AVAILABLE_MSG = "CUDA device not available"
CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda."
MPS_NOT_AVAILABLE_MSG = "MPS device not available"
OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda."
@contextlib.contextmanager
......@@ -107,18 +119,28 @@ def disable_console_output():
yield
def cpu_and_gpu():
def cpu_and_cuda():
import pytest # noqa
return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda))
def cpu_and_cuda_and_mps():
return cpu_and_cuda() + (pytest.param("mps", marks=pytest.mark.needs_mps),)
def needs_cuda(test_func):
import pytest # noqa
return pytest.mark.needs_cuda(test_func)
def needs_mps(test_func):
import pytest # noqa
return pytest.mark.needs_mps(test_func)
def _create_data(height=3, width=3, channels=3, device="cpu"):
# TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture
tensor = torch.randint(0, 256, (channels, height, width), dtype=torch.uint8, device=device)
......@@ -137,9 +159,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu
return batch_tensor
assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0)
def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None):
names = []
for i in range(num_videos):
......@@ -160,6 +179,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None):
def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None):
# FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it
np_pil_image = np.array(pil_image)
if np_pil_image.ndim == 2:
np_pil_image = np_pil_image[:, :, None]
......@@ -172,6 +192,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None):
def _assert_approx_equal_tensor_to_pil(
tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None
):
# FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it
# TODO: we could just merge this into _assert_equal_tensor_to_pil
np_pil_image = np.array(pil_image)
if np_pil_image.ndim == 2:
......@@ -210,7 +231,7 @@ def cache(fn):
"""
sentinel = object()
out_cache = {}
exc_cache = {}
exc_tb_cache = {}
@functools.wraps(fn)
def wrapper(*args, **kwargs):
......@@ -220,17 +241,280 @@ def cache(fn):
if out is not sentinel:
return out
exc = exc_cache.get(key, sentinel)
if exc is not sentinel:
raise exc
exc_tb = exc_tb_cache.get(key, sentinel)
if exc_tb is not sentinel:
raise exc_tb[0].with_traceback(exc_tb[1])
try:
out = fn(*args, **kwargs)
except Exception as exc:
exc_cache[key] = exc
# We need to cache the traceback here as well. Otherwise, each re-raise will add the internal pytest
# traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding
# the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details.
exc_tb_cache[key] = exc, exc.__traceback__
raise exc
out_cache[key] = out
return out
return wrapper
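# Illustrative usage sketch (assumption: the decorated callable is only invoked
# with hashable arguments, since they form the cache key):
@cache
def _example_cached_lookup(name):
    # The body runs at most once per distinct `name`; later calls return the
    # cached value or re-raise the cached exception with its original traceback.
    return name.upper()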
def combinations_grid(**kwargs):
"""Creates a grid of input combinations.
Each element in the returned sequence is a dictionary containing one possible combination as values.
Example:
>>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham"))
[
{'foo': 'bar', 'spam': 'eggs'},
{'foo': 'bar', 'spam': 'ham'},
{'foo': 'baz', 'spam': 'eggs'},
{'foo': 'baz', 'spam': 'ham'}
]
"""
return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())]
class ImagePair(TensorLikePair):
def __init__(
self,
actual,
expected,
*,
mae=False,
**other_parameters,
):
if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]):
actual, expected = [to_image(input) for input in [actual, expected]]
super().__init__(actual, expected, **other_parameters)
self.mae = mae
def compare(self) -> None:
actual, expected = self.actual, self.expected
self._compare_attributes(actual, expected)
actual, expected = self._equalize_attributes(actual, expected)
if self.mae:
if actual.dtype is torch.uint8:
actual, expected = actual.to(torch.int), expected.to(torch.int)
mae = float(torch.abs(actual - expected).float().mean())
if mae > self.atol:
self._fail(
AssertionError,
f"The MAE of the images is {mae}, but only {self.atol} is allowed.",
)
else:
super()._compare_values(actual, expected)
def assert_close(
actual,
expected,
*,
allow_subclasses=True,
rtol=None,
atol=None,
equal_nan=False,
check_device=True,
check_dtype=True,
check_layout=True,
check_stride=False,
msg=None,
**kwargs,
):
"""Superset of :func:`torch.testing.assert_close` with support for PIL vs. tensor image comparison"""
__tracebackhide__ = True
error_metas = not_close_error_metas(
actual,
expected,
pair_types=(
NonePair,
BooleanPair,
NumberPair,
ImagePair,
TensorLikePair,
),
allow_subclasses=allow_subclasses,
rtol=rtol,
atol=atol,
equal_nan=equal_nan,
check_device=check_device,
check_dtype=check_dtype,
check_layout=check_layout,
check_stride=check_stride,
**kwargs,
)
if error_metas:
raise error_metas[0].to_error(msg)
assert_equal = functools.partial(assert_close, rtol=0, atol=0)
DEFAULT_SIZE = (17, 11)
NUM_CHANNELS_MAP = {
"GRAY": 1,
"GRAY_ALPHA": 2,
"RGB": 3,
"RGBA": 4,
}
def make_image(
size=DEFAULT_SIZE,
*,
color_space="RGB",
batch_dims=(),
dtype=None,
device="cpu",
memory_format=torch.contiguous_format,
):
num_channels = NUM_CHANNELS_MAP[color_space]
dtype = dtype or torch.uint8
max_value = get_max_value(dtype)
data = torch.testing.make_tensor(
(*batch_dims, num_channels, *size),
low=0,
high=max_value,
dtype=dtype,
device=device,
memory_format=memory_format,
)
if color_space in {"GRAY_ALPHA", "RGBA"}:
data[..., -1, :, :] = max_value
return tv_tensors.Image(data)
def make_image_tensor(*args, **kwargs):
return make_image(*args, **kwargs).as_subclass(torch.Tensor)
def make_image_pil(*args, **kwargs):
return to_pil_image(make_image(*args, **kwargs))
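# Illustrative sketch tying the helpers together (size and color space are
# arbitrary example values): a PIL round trip of a random uint8 image should
# be lossless, which `assert_equal` verifies via the `ImagePair` defined above.
def _example_image_roundtrip():
    image = make_image((4, 4), color_space="RGB")
    # tensor vs. tensor comparison
    assert_equal(to_image(to_pil_image(image)), image)
    # PIL vs. PIL comparison goes through ImagePair
    assert_equal(to_pil_image(image), to_pil_image(image))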
def make_bounding_boxes(
canvas_size=DEFAULT_SIZE,
*,
format=tv_tensors.BoundingBoxFormat.XYXY,
dtype=None,
device="cpu",
):
def sample_position(values, max_value):
# We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high.
        # However, the upper limits differ per object here, so we sample each position separately.
return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()])
if isinstance(format, str):
format = tv_tensors.BoundingBoxFormat[format]
dtype = dtype or torch.float32
num_objects = 1
h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size]
y = sample_position(h, canvas_size[0])
x = sample_position(w, canvas_size[1])
if format is tv_tensors.BoundingBoxFormat.XYWH:
parts = (x, y, w, h)
elif format is tv_tensors.BoundingBoxFormat.XYXY:
x1, y1 = x, y
x2 = x1 + w
y2 = y1 + h
parts = (x1, y1, x2, y2)
elif format is tv_tensors.BoundingBoxFormat.CXCYWH:
cx = x + w / 2
cy = y + h / 2
parts = (cx, cy, w, h)
else:
raise ValueError(f"Format {format} is not supported")
return tv_tensors.BoundingBoxes(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size
)
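# Illustrative sketch (format is an arbitrary example value): a single random
# box on the default canvas, returned as a (1, 4) BoundingBoxes tensor that
# remembers its format and canvas size.
def _example_make_boxes():
    boxes = make_bounding_boxes(format="XYWH")
    assert boxes.shape == (1, 4)
    assert boxes.format is tv_tensors.BoundingBoxFormat.XYWH
    assert boxes.canvas_size == DEFAULT_SIZE
    return boxes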
def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"):
"""Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks"""
num_objects = 1
return tv_tensors.Mask(
torch.testing.make_tensor(
(num_objects, *size),
low=0,
high=2,
dtype=dtype or torch.bool,
device=device,
)
)
def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"):
"""Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value"""
return tv_tensors.Mask(
torch.testing.make_tensor(
(*batch_dims, *size),
low=0,
high=num_categories,
dtype=dtype or torch.uint8,
device=device,
)
)
def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs):
return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs))
def make_video_tensor(*args, **kwargs):
return make_video(*args, **kwargs).as_subclass(torch.Tensor)
def assert_run_python_script(source_code):
"""Utility to check assertions in an independent Python subprocess.
The script provided in the source code should return 0 and not print
anything on stderr or stdout. Modified from scikit-learn test utils.
Args:
source_code (str): The Python source code to execute.
"""
with get_tmp_dir() as root:
path = pathlib.Path(root) / "main.py"
with open(path, "w") as file:
file.write(source_code)
try:
out = check_output([sys.executable, str(path)], stderr=STDOUT)
except CalledProcessError as e:
raise RuntimeError(f"script errored with output:\n{e.output.decode()}")
if out != b"":
raise AssertionError(out.decode())
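# Illustrative usage sketch (the snippet below is an arbitrary example): fail
# the calling test if importing torchvision prints anything or errors out.
def _example_check_silent_import():
    assert_run_python_script("import torchvision\n")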
@contextlib.contextmanager
def assert_no_warnings():
# The name `catch_warnings` is a misnomer as the context manager does **not** catch any warnings, but rather scopes
    # the warning filters. All changes that are made to the filters while in this context will be reset upon exit.
with warnings.catch_warnings():
warnings.simplefilter("error")
yield
@contextlib.contextmanager
def ignore_jit_no_profile_information_warning():
# Calling a scripted object often triggers a warning like
# `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information`
# with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore
# them.
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning)
yield
......@@ -3,12 +3,21 @@ import random
import numpy as np
import pytest
import torch
from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER
from common_utils import (
CUDA_NOT_AVAILABLE_MSG,
IN_FBCODE,
IN_OSS_CI,
IN_RE_WORKER,
MPS_NOT_AVAILABLE_MSG,
OSS_CI_GPU_NO_CUDA_MSG,
)
def pytest_configure(config):
# register an additional marker (see pytest_collection_modifyitems)
config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device")
config.addinivalue_line("markers", "needs_mps: mark for tests that rely on a MPS device")
config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected")
......@@ -16,9 +25,9 @@ def pytest_collection_modifyitems(items):
# This hook is called by pytest after it has collected the tests (google its name to check out its doc!)
# We can ignore some tests as we see fit here, or add marks, such as a skip mark.
#
# Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the
# Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the
# tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already.
# This is true for both CircleCI and the fbcode internal CI.
# This is true for both OSS CI and the fbcode internal CI.
# In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on
# pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if
# these tests never existed.
......@@ -28,16 +37,20 @@ def pytest_collection_modifyitems(items):
# The needs_cuda mark will exist if the test was explicitly decorated with
# the @needs_cuda decorator. It will also exist if it was parametrized with a
# parameter that has the mark: for example if a test is parametrized with
# @pytest.mark.parametrize('device', cpu_and_gpu())
# @pytest.mark.parametrize('device', cpu_and_cuda())
# the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark,
# and the ones with device == 'cpu' won't have the mark.
needs_cuda = item.get_closest_marker("needs_cuda") is not None
needs_mps = item.get_closest_marker("needs_mps") is not None
if needs_cuda and not torch.cuda.is_available():
# In general, we skip cuda tests on machines without a GPU
# There are special cases though, see below
item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG))
if needs_mps and not torch.backends.mps.is_available():
item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG))
if IN_FBCODE:
# fbcode doesn't like skipping tests, so instead we just don't collect the test
# so that they don't even "exist", hence the continue statements.
......@@ -49,15 +62,18 @@ def pytest_collection_modifyitems(items):
# TODO: something more robust would be to do that only in a sandcastle instance,
# so that we can still see the test being skipped when testing locally from a devvm
continue
elif IN_CIRCLE_CI:
if needs_mps and not torch.backends.mps.is_available():
# Same as above, but for MPS
continue
elif IN_OSS_CI:
# Here we're not in fbcode, so we can safely collect and skip tests.
if not needs_cuda and torch.cuda.is_available():
# Similar to what happens in RE workers: we don't need the CircleCI GPU machines
# Similar to what happens in RE workers: we don't need the OSS CI GPU machines
# to run the CPU-only tests.
item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG))
item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG))
if item.get_closest_marker("dont_collect") is not None:
# currently, this is only used for some tests we're sure we dont want to run on fbcode
# currently, this is only used for some tests we're sure we don't want to run on fbcode
continue
out_items.append(item)
......
......@@ -5,6 +5,7 @@ import inspect
import itertools
import os
import pathlib
import platform
import random
import shutil
import string
......@@ -25,6 +26,7 @@ import torch
import torchvision.datasets
import torchvision.io
from common_utils import disable_console_output, get_tmp_dir
from torch.utils._pytree import tree_any
from torchvision.transforms.functional import get_dimensions
......@@ -137,7 +139,7 @@ def test_all_configs(test):
.. note::
This will try to remove duplicate configurations. During this process it will not not preserve a potential
This will try to remove duplicate configurations. During this process it will not preserve a potential
ordering of the configurations or an inner ordering of a configuration.
"""
......@@ -146,7 +148,7 @@ def test_all_configs(test):
return [dict(config_) for config_ in {tuple(sorted(config.items())) for config in configs}]
except TypeError:
# A TypeError will be raised if a value of any config is not hashable, e.g. a list. In that case duplicate
# removal would be a lot more elaborate and we simply bail out.
# removal would be a lot more elaborate, and we simply bail out.
return configs
@functools.wraps(test)
......@@ -169,23 +171,6 @@ def test_all_configs(test):
return wrapper
def combinations_grid(**kwargs):
"""Creates a grid of input combinations.
Each element in the returned sequence is a dictionary containing one possible combination as values.
Example:
>>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham"))
[
{'foo': 'bar', 'spam': 'eggs'},
{'foo': 'bar', 'spam': 'ham'},
{'foo': 'baz', 'spam': 'eggs'},
{'foo': 'baz', 'spam': 'ham'}
]
"""
return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())]
class DatasetTestCase(unittest.TestCase):
"""Abstract base class for all dataset testcases.
......@@ -297,7 +282,7 @@ class DatasetTestCase(unittest.TestCase):
.. note::
The default behavior is only valid if the dataset to be tested has ``root`` as the only required parameter.
Otherwise you need to overwrite this method.
Otherwise, you need to overwrite this method.
Args:
tmpdir (str): Path to a temporary directory. For most cases this acts as root directory for the dataset
......@@ -564,7 +549,7 @@ class DatasetTestCase(unittest.TestCase):
@test_all_configs
def test_num_examples(self, config):
with self.create_dataset(config) as (dataset, info):
assert len(dataset) == info["num_examples"]
assert len(list(dataset)) == len(dataset) == info["num_examples"]
@test_all_configs
def test_transforms(self, config):
......@@ -581,6 +566,42 @@ class DatasetTestCase(unittest.TestCase):
mock.assert_called()
@test_all_configs
def test_transforms_v2_wrapper(self, config):
from torchvision import tv_tensors
from torchvision.datasets import wrap_dataset_for_transforms_v2
try:
with self.create_dataset(config) as (dataset, info):
for target_keys in [None, "all"]:
if target_keys is not None and self.DATASET_CLASS not in {
torchvision.datasets.CocoDetection,
torchvision.datasets.VOCDetection,
torchvision.datasets.Kitti,
torchvision.datasets.WIDERFace,
}:
with self.assertRaisesRegex(ValueError, "`target_keys` is currently only supported for"):
wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys)
continue
wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys)
assert isinstance(wrapped_dataset, self.DATASET_CLASS)
assert len(wrapped_dataset) == info["num_examples"]
wrapped_sample = wrapped_dataset[0]
assert tree_any(
lambda item: isinstance(item, (tv_tensors.TVTensor, PIL.Image.Image)), wrapped_sample
)
except TypeError as error:
msg = f"No wrapper exists for dataset class {type(dataset).__name__}"
if str(error).startswith(msg):
pytest.skip(msg)
raise error
except RuntimeError as error:
if "currently not supported by this wrapper" in str(error):
pytest.skip("Config is currently not supported by this wrapper")
raise error
class ImageDatasetTestCase(DatasetTestCase):
"""Abstract base class for image dataset testcases.
......@@ -604,7 +625,7 @@ class ImageDatasetTestCase(DatasetTestCase):
patch_checks=patch_checks,
**kwargs,
) as (dataset, info):
# PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access
# PIL.Image.open() only loads the image metadata upfront and keeps the file open until the first access
# to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we
# force-load opened images.
# This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an
......@@ -641,27 +662,73 @@ class VideoDatasetTestCase(DatasetTestCase):
FEATURE_TYPES = (torch.Tensor, torch.Tensor, int)
REQUIRED_PACKAGES = ("av",)
DEFAULT_FRAMES_PER_CLIP = 1
FRAMES_PER_CLIP = 1
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.dataset_args = self._set_default_frames_per_clip(self.dataset_args)
def _set_default_frames_per_clip(self, inject_fake_data):
def _set_default_frames_per_clip(self, dataset_args):
argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__)
args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)]
frames_per_clip_last = args_without_default[-1] == "frames_per_clip"
@functools.wraps(inject_fake_data)
@functools.wraps(dataset_args)
def wrapper(tmpdir, config):
args = inject_fake_data(tmpdir, config)
args = dataset_args(tmpdir, config)
if frames_per_clip_last and len(args) == len(args_without_default) - 1:
args = (*args, self.DEFAULT_FRAMES_PER_CLIP)
args = (*args, self.FRAMES_PER_CLIP)
return args
return wrapper
def test_output_format(self):
for output_format in ["TCHW", "THWC"]:
with self.create_dataset(output_format=output_format) as (dataset, _):
for video, *_ in dataset:
if output_format == "TCHW":
num_frames, num_channels, *_ = video.shape
else: # output_format == "THWC":
num_frames, *_, num_channels = video.shape
assert num_frames == self.FRAMES_PER_CLIP
assert num_channels == 3
@test_all_configs
def test_transforms_v2_wrapper(self, config):
# `output_format == "THWC"` is not supported by the wrapper. Thus, we skip the `config` if it is set explicitly
# or use the supported `"TCHW"`
if config.setdefault("output_format", "TCHW") == "THWC":
return
super().test_transforms_v2_wrapper.__wrapped__(self, config)
def _no_collate(batch):
return batch
def check_transforms_v2_wrapper_spawn(dataset):
    # On Linux, the DataLoader forks the main process by default. Forking is not available on macOS (or Windows), so
    # new subprocesses are spawned there instead. Spawning requires the whole pipeline, including the dataset, to be
    # picklable, which is what we are enforcing here.
if platform.system() != "Darwin":
pytest.skip("Multiprocessing spawning is only checked on macOS.")
from torch.utils.data import DataLoader
from torchvision import tv_tensors
from torchvision.datasets import wrap_dataset_for_transforms_v2
wrapped_dataset = wrap_dataset_for_transforms_v2(dataset)
dataloader = DataLoader(wrapped_dataset, num_workers=2, multiprocessing_context="spawn", collate_fn=_no_collate)
for wrapped_sample in dataloader:
assert tree_any(
lambda item: isinstance(item, (tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)), wrapped_sample
)
def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor:
r"""Create a random uint8 tensor.
......@@ -786,7 +853,7 @@ def create_video_file(
fps: float = 25,
**kwargs: Any,
) -> pathlib.Path:
"""Create an video file from random data.
"""Create a video file from random data.
Args:
root (Union[str, pathlib.Path]): Root directory the video file will be placed in.
......@@ -951,7 +1018,7 @@ def create_random_string(length: int, *digits: str) -> str:
Args:
length (int): Number of characters in the generated string.
*characters (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`.
*digits (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`.
"""
if not digits:
digits = string.ascii_lowercase
......