Unverified commit 4774fe3a authored by Philip Meier, committed by GitHub

Split tests for transforms v2 and prototype (#7278)

parent ac1512b6
@@ -6,4 +6,17 @@ eval "$(./conda/bin/conda shell.bash hook)"
conda activate ./env
python -m torch.utils.collect_env
pytest --junitxml=test-results/junit.xml -v --durations 20
case "$(uname -s)" in
Darwin*)
# The largest macOS runner is not able to handle the regular test suite plus the transforms v2 tests at the same
# time due to insufficient resources. Thus, we ignore the transforms v2 tests at first and run them in a separate
# step afterwards.
GLOB='test/test_transforms_v2*'
pytest --junitxml=test-results/junit.xml -v --durations 20 --ignore-glob="${GLOB}"
eval "pytest --junitxml=test-results/junit-transforms-v2.xml -v --durations 20 ${GLOB}"
;;
*)
pytest --junitxml=test-results/junit.xml -v --durations 20
;;
esac
@@ -18,7 +18,8 @@ from collections import Counter, defaultdict
import numpy as np
import pytest
import torch
from datasets_utils import combinations_grid, create_image_file, create_image_folder, make_tar, make_zip
from common_utils import combinations_grid
from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip
from torch.nn.functional import one_hot
from torch.testing import make_tensor as _make_tensor
from torchvision.prototype import datasets
......
import collections.abc
import contextlib
import dataclasses
import enum
import functools
import itertools
import os
import pathlib
import random
import shutil
import tempfile
from collections import defaultdict
from typing import Callable, Sequence, Tuple, Union
import numpy as np
import PIL.Image
import pytest
import torch
import torch.testing
from PIL import Image
from torchvision import io
import __main__ # noqa: 401
from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair
from torchvision import datapoints, io
from torchvision.transforms._functional_tensor import _max_value as get_max_value
from torchvision.transforms.v2.functional import convert_dtype_image_tensor, to_image_tensor
IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"])
@@ -137,9 +150,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu
return batch_tensor
assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0)
def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None):
names = []
for i in range(num_videos):
@@ -160,6 +170,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None):
def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None):
# FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it
np_pil_image = np.array(pil_image)
if np_pil_image.ndim == 2:
np_pil_image = np_pil_image[:, :, None]
@@ -172,6 +183,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None):
def _assert_approx_equal_tensor_to_pil(
tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None
):
# FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it
# TODO: we could just merge this into _assert_equal_tensor_to_pil
np_pil_image = np.array(pil_image)
if np_pil_image.ndim == 2:
@@ -237,3 +249,592 @@ def cache(fn):
return out
return wrapper
def combinations_grid(**kwargs):
"""Creates a grid of input combinations.
Each element in the returned sequence is a dictionary containing one possible combination as values.
Example:
>>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham"))
[
{'foo': 'bar', 'spam': 'eggs'},
{'foo': 'bar', 'spam': 'ham'},
{'foo': 'baz', 'spam': 'eggs'},
{'foo': 'baz', 'spam': 'ham'}
]
"""
return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())]
class ImagePair(TensorLikePair):
def __init__(
self,
actual,
expected,
*,
mae=False,
**other_parameters,
):
if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]):
actual, expected = [to_image_tensor(input) for input in [actual, expected]]
super().__init__(actual, expected, **other_parameters)
self.mae = mae
def compare(self) -> None:
actual, expected = self.actual, self.expected
self._compare_attributes(actual, expected)
actual, expected = self._equalize_attributes(actual, expected)
if self.mae:
actual, expected = self._promote_for_comparison(actual, expected)
mae = float(torch.abs(actual - expected).float().mean())
if mae > self.atol:
self._fail(
AssertionError,
f"The MAE of the images is {mae}, but only {self.atol} is allowed.",
)
else:
super()._compare_values(actual, expected)
def assert_close(
actual,
expected,
*,
allow_subclasses=True,
rtol=None,
atol=None,
equal_nan=False,
check_device=True,
check_dtype=True,
check_layout=True,
check_stride=False,
msg=None,
**kwargs,
):
"""Superset of :func:`torch.testing.assert_close` with support for PIL vs. tensor image comparison"""
__tracebackhide__ = True
error_metas = not_close_error_metas(
actual,
expected,
pair_types=(
NonePair,
BooleanPair,
NumberPair,
ImagePair,
TensorLikePair,
),
allow_subclasses=allow_subclasses,
rtol=rtol,
atol=atol,
equal_nan=equal_nan,
check_device=check_device,
check_dtype=check_dtype,
check_layout=check_layout,
check_stride=check_stride,
**kwargs,
)
if error_metas:
raise error_metas[0].to_error(msg)
assert_equal = functools.partial(assert_close, rtol=0, atol=0)
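# Illustrative sketch (not part of the commit): how the `assert_close` / `assert_equal` helpers above can be
# used to compare a PIL image against its tensor counterpart via the `ImagePair` machinery. The concrete
# values below are made up.
#
#     pil_image = PIL.Image.new("RGB", (4, 4), color=(10, 20, 30))
#     tensor_image = to_image_tensor(pil_image)
#     assert_equal(tensor_image, pil_image)  # the PIL input is converted to a tensor before comparison
#     assert_close(tensor_image + 1, pil_image, mae=True, atol=1.0)  # mean-absolute-error comparison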
def parametrized_error_message(*args, **kwargs):
def to_str(obj):
if isinstance(obj, torch.Tensor) and obj.numel() > 10:
return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})"
elif isinstance(obj, enum.Enum):
return f"{type(obj).__name__}.{obj.name}"
else:
return repr(obj)
if args or kwargs:
postfix = "\n".join(
[
"",
"Failure happened for the following parameters:",
"",
*[to_str(arg) for arg in args],
*[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()],
]
)
else:
postfix = ""
def wrapper(msg):
return msg + postfix
return wrapper
class ArgsKwargs:
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def __iter__(self):
yield self.args
yield self.kwargs
def load(self, device="cpu"):
return ArgsKwargs(
*(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args),
**{
keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg
for keyword, arg in self.kwargs.items()
},
)
DEFAULT_SQUARE_SPATIAL_SIZE = 15
DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33)
DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9)
DEFAULT_SPATIAL_SIZES = (
DEFAULT_LANDSCAPE_SPATIAL_SIZE,
DEFAULT_PORTRAIT_SPATIAL_SIZE,
DEFAULT_SQUARE_SPATIAL_SIZE,
"random",
)
def _parse_spatial_size(size, *, name="size"):
if size == "random":
return tuple(torch.randint(15, 33, (2,)).tolist())
elif isinstance(size, int) and size > 0:
return (size, size)
elif (
isinstance(size, collections.abc.Sequence)
and len(size) == 2
and all(isinstance(length, int) and length > 0 for length in size)
):
return tuple(size)
else:
raise pytest.UsageError(
f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers,"
f"but got {size} instead."
)
VALID_EXTRA_DIMS = ((), (4,), (2, 3))
DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5))
DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS)
def from_loader(loader_fn):
def wrapper(*args, **kwargs):
device = kwargs.pop("device", "cpu")
loader = loader_fn(*args, **kwargs)
return loader.load(device)
return wrapper
def from_loaders(loaders_fn):
def wrapper(*args, **kwargs):
device = kwargs.pop("device", "cpu")
loaders = loaders_fn(*args, **kwargs)
for loader in loaders:
yield loader.load(device)
return wrapper
@dataclasses.dataclass
class TensorLoader:
fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor]
shape: Sequence[int]
dtype: torch.dtype
def load(self, device):
return self.fn(self.shape, self.dtype, device)
@dataclasses.dataclass
class ImageLoader(TensorLoader):
spatial_size: Tuple[int, int] = dataclasses.field(init=False)
num_channels: int = dataclasses.field(init=False)
def __post_init__(self):
self.spatial_size = self.shape[-2:]
self.num_channels = self.shape[-3]
NUM_CHANNELS_MAP = {
"GRAY": 1,
"GRAY_ALPHA": 2,
"RGB": 3,
"RGBA": 4,
}
def get_num_channels(color_space):
num_channels = NUM_CHANNELS_MAP.get(color_space)
if not num_channels:
raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}")
return num_channels
def make_image_loader(
size="random",
*,
color_space="RGB",
extra_dims=(),
dtype=torch.float32,
constant_alpha=True,
):
size = _parse_spatial_size(size)
num_channels = get_num_channels(color_space)
def fn(shape, dtype, device):
max_value = get_max_value(dtype)
data = torch.testing.make_tensor(shape, low=0, high=max_value, dtype=dtype, device=device)
if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha:
data[..., -1, :, :] = max_value
return datapoints.Image(data)
return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype)
make_image = from_loader(make_image_loader)
def make_image_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
color_spaces=(
"GRAY",
"GRAY_ALPHA",
"RGB",
"RGBA",
),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.float32, torch.float64, torch.uint8),
constant_alpha=True,
):
for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes):
yield make_image_loader(**params, constant_alpha=constant_alpha)
make_images = from_loaders(make_image_loaders)
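# Illustrative sketch (not part of the commit): `from_loader` / `from_loaders` turn the loader factories above
# into helpers that build the actual datapoints. Sizes and parameters below are made up.
#
#     image = make_image(size=(32, 32), color_space="RGB", dtype=torch.uint8)  # datapoints.Image, shape (3, 32, 32)
#     for image in make_images(sizes=((8, 8),), color_spaces=("GRAY",), extra_dims=((),), dtypes=(torch.uint8,)):
#         assert isinstance(image, datapoints.Image)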
def make_image_loader_for_interpolation(size="random", *, color_space="RGB", dtype=torch.uint8):
size = _parse_spatial_size(size)
num_channels = get_num_channels(color_space)
def fn(shape, dtype, device):
height, width = shape[-2:]
image_pil = (
PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")
.resize((width, height))
.convert(
{
"GRAY": "L",
"GRAY_ALPHA": "LA",
"RGB": "RGB",
"RGBA": "RGBA",
}[color_space]
)
)
image_tensor = convert_dtype_image_tensor(to_image_tensor(image_pil).to(device=device), dtype=dtype)
return datapoints.Image(image_tensor)
return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype)
def make_image_loaders_for_interpolation(
sizes=((233, 147),),
color_spaces=("RGB",),
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes):
yield make_image_loader_for_interpolation(**params)
@dataclasses.dataclass
class BoundingBoxLoader(TensorLoader):
format: datapoints.BoundingBoxFormat
spatial_size: Tuple[int, int]
def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
low, high = torch.broadcast_tensors(
*[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))]
)
return torch.stack(
[
torch.randint(low_scalar, high_scalar, (), **kwargs)
for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist())
]
).reshape(low.shape)
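# Illustrative sketch (not part of the commit): `randint_with_tensor_bounds` draws one integer per element with
# element-wise [low, high) bounds given by (broadcastable) tensors. The bounds below are made up.
#
#     low = torch.tensor([0, 2])
#     high = torch.tensor([3, 5])
#     sample = randint_with_tensor_bounds(low, high)  # shape (2,), with low[i] <= sample[i] < high[i]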
def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32):
if isinstance(format, str):
format = datapoints.BoundingBoxFormat[format]
if format not in {
datapoints.BoundingBoxFormat.XYXY,
datapoints.BoundingBoxFormat.XYWH,
datapoints.BoundingBoxFormat.CXCYWH,
}:
raise pytest.UsageError(f"Can't make bounding box in format {format}")
spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")
def fn(shape, dtype, device):
*extra_dims, num_coordinates = shape
if num_coordinates != 4:
raise pytest.UsageError()
if any(dim == 0 for dim in extra_dims):
return datapoints.BoundingBox(
torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
)
height, width = spatial_size
if format == datapoints.BoundingBoxFormat.XYXY:
x1 = torch.randint(0, width // 2, extra_dims)
y1 = torch.randint(0, height // 2, extra_dims)
x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1
y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1
parts = (x1, y1, x2, y2)
elif format == datapoints.BoundingBoxFormat.XYWH:
x = torch.randint(0, width // 2, extra_dims)
y = torch.randint(0, height // 2, extra_dims)
w = randint_with_tensor_bounds(1, width - x)
h = randint_with_tensor_bounds(1, height - y)
parts = (x, y, w, h)
else:  # format == datapoints.BoundingBoxFormat.CXCYWH
cx = torch.randint(1, width - 1, extra_dims)
cy = torch.randint(1, height - 1, extra_dims)
w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1)
h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1)
parts = (cx, cy, w, h)
return datapoints.BoundingBox(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
)
return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size)
make_bounding_box = from_loader(make_bounding_box_loader)
def make_bounding_box_loaders(
*,
extra_dims=DEFAULT_EXTRA_DIMS,
formats=tuple(datapoints.BoundingBoxFormat),
spatial_size="random",
dtypes=(torch.float32, torch.float64, torch.int64),
):
for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes):
yield make_bounding_box_loader(**params, spatial_size=spatial_size)
make_bounding_boxes = from_loaders(make_bounding_box_loaders)
class MaskLoader(TensorLoader):
pass
def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8):
# This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects
size = _parse_spatial_size(size)
num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects
def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=2, dtype=dtype, device=device)
return datapoints.Mask(data)
return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype)
make_detection_mask = from_loader(make_detection_mask_loader)
def make_detection_mask_loaders(
sizes=DEFAULT_SPATIAL_SIZES,
num_objects=(1, 0, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes):
yield make_detection_mask_loader(**params)
make_detection_masks = from_loaders(make_detection_mask_loaders)
def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
# This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
size = _parse_spatial_size(size)
num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories
def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=dtype, device=device)
return datapoints.Mask(data)
return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype)
make_segmentation_mask = from_loader(make_segmentation_mask_loader)
def make_segmentation_mask_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
num_categories=(1, 2, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes):
yield make_segmentation_mask_loader(**params)
make_segmentation_masks = from_loaders(make_segmentation_mask_loaders)
def make_mask_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
num_objects=(1, 0, "random"),
num_categories=(1, 2, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes)
yield from make_segmentation_mask_loaders(
sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes
)
make_masks = from_loaders(make_mask_loaders)
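# Illustrative sketch (not part of the commit): the shape difference between the two mask flavors produced above.
# Sizes and counts are made up.
#
#     detection = make_detection_mask(size=(4, 4), num_objects=3)          # datapoints.Mask, shape (3, 4, 4), values in {0, 1}
#     segmentation = make_segmentation_mask(size=(4, 4), num_categories=5)  # datapoints.Mask, shape (4, 4), values in [0, 5)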
class VideoLoader(ImageLoader):
pass
def make_video_loader(
size="random",
*,
color_space="RGB",
num_frames="random",
extra_dims=(),
dtype=torch.uint8,
):
size = _parse_spatial_size(size)
num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames
def fn(shape, dtype, device):
video = make_image(size=shape[-2:], extra_dims=shape[:-3], dtype=dtype, device=device)
return datapoints.Video(video)
return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype)
make_video = from_loader(make_video_loader)
def make_video_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
color_spaces=(
"GRAY",
"RGB",
),
num_frames=(1, 0, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8, torch.float32, torch.float64),
):
for params in combinations_grid(
size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes
):
yield make_video_loader(**params)
make_videos = from_loaders(make_video_loaders)
class TestMark:
def __init__(
self,
# Tuple of test class name and test function name that identifies the test the mark is applied to. If there is
# no test class, i.e. a standalone test function, use `None`.
test_id,
# `pytest.mark.*` to apply, e.g. `pytest.mark.skip` or `pytest.mark.xfail`
mark,
*,
# Callable that will be passed an `ArgsKwargs` and should return a boolean indicating whether the mark
# should be applied. If omitted, the mark is always applied.
condition=None,
):
self.test_id = test_id
self.mark = mark
self.condition = condition or (lambda args_kwargs: True)
def mark_framework_limitation(test_id, reason, condition=None):
# The purpose of this function is to have a single entry point for skip marks that exist only because the test
# framework cannot handle the kernel in general or a specific parameter combination.
# As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is
# still justified.
# We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus,
# we would be wasting CI resources for no reason most of the time.
return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition)
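# Illustrative sketch (not part of the commit): a hypothetical mark that skips a single parametrization of a test
# because the reference framework cannot handle it. The test id, reason, and condition below are made up.
#
#     mark = mark_framework_limitation(
#         ("TestKernels", "test_against_reference"),
#         "Reference implementation does not support degenerate bounding boxes",
#         condition=lambda args_kwargs: args_kwargs.kwargs.get("format") == "XYWH",
#     )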
class InfoBase:
def __init__(
self,
*,
# Identifier of the info that shows up in the parametrization.
id,
# Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization.
# See the `TestMark` class for details
test_marks=None,
# Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see
# `TestMark`), the dtype, and the device.
closeness_kwargs=None,
):
self.id = id
self.test_marks = test_marks or []
test_marks_map = defaultdict(list)
for test_mark in self.test_marks:
test_marks_map[test_mark.test_id].append(test_mark)
self._test_marks_map = dict(test_marks_map)
self.closeness_kwargs = closeness_kwargs or dict()
def get_marks(self, test_id, args_kwargs):
return [
test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs)
]
def get_closeness_kwargs(self, test_id, *, dtype, device):
if not (isinstance(test_id, tuple) and len(test_id) == 2):
msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name"
if callable(test_id):
msg += ". Did you forget to add the `test_id` fixture to parameters of the test?"
else:
msg += f", but got {test_id} instead."
raise pytest.UsageError(msg)
if isinstance(device, torch.device):
device = device.type
return self.closeness_kwargs.get((test_id, dtype, device), dict())
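# Illustrative sketch (not part of the commit): how the `closeness_kwargs` lookup above is keyed by
# (test_id, dtype, device). The test id and tolerances below are made up.
#
#     info = InfoBase(
#         id="resize",
#         closeness_kwargs={(("TestKernels", "test_against_reference"), torch.uint8, "cpu"): dict(atol=1, rtol=0)},
#     )
#     info.get_closeness_kwargs(("TestKernels", "test_against_reference"), dtype=torch.uint8, device="cpu")
#     # -> {'atol': 1, 'rtol': 0}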
@@ -170,23 +170,6 @@ def test_all_configs(test):
return wrapper
def combinations_grid(**kwargs):
"""Creates a grid of input combinations.
Each element in the returned sequence is a dictionary containing one possible combination as values.
Example:
>>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham"))
[
{'foo': 'bar', 'spam': 'eggs'},
{'foo': 'bar', 'spam': 'ham'},
{'foo': 'baz', 'spam': 'eggs'},
{'foo': 'baz', 'spam': 'ham'}
]
"""
return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())]
class DatasetTestCase(unittest.TestCase):
"""Abstract base class for all dataset testcases.
......
"""This module is separated from common_utils.py to prevent the former to be dependent on torchvision.prototype"""
import collections.abc
import dataclasses
import enum
import functools
import pathlib
from collections import defaultdict
from typing import Callable, Optional, Sequence, Tuple, Union
from typing import Optional, Sequence
import PIL.Image
import pytest
import torch
import torch.testing
import torchvision.prototype.datapoints as proto_datapoints
from datasets_utils import combinations_grid
from torch.nn.functional import one_hot
from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair
from torchvision import datapoints
from torchvision.transforms._functional_tensor import _max_value as get_max_value
from torchvision.transforms.v2.functional import convert_dtype_image_tensor, to_image_tensor
__all__ = [
"assert_close",
"assert_equal",
"ArgsKwargs",
"VALID_EXTRA_DIMS",
"make_image_loaders",
"make_image",
"make_images",
"make_bounding_box_loaders",
"make_bounding_box",
"make_bounding_boxes",
"make_label",
"make_one_hot_labels",
"make_detection_mask_loaders",
"make_detection_mask",
"make_detection_masks",
"make_segmentation_mask_loaders",
"make_segmentation_mask",
"make_segmentation_masks",
"make_mask_loaders",
"make_masks",
"make_video",
"make_videos",
"TestMark",
"mark_framework_limitation",
"InfoBase",
]
class ImagePair(TensorLikePair):
def __init__(
self,
actual,
expected,
*,
mae=False,
**other_parameters,
):
if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]):
actual, expected = [to_image_tensor(input) for input in [actual, expected]]
super().__init__(actual, expected, **other_parameters)
self.mae = mae
def compare(self) -> None:
actual, expected = self.actual, self.expected
self._compare_attributes(actual, expected)
actual, expected = self._equalize_attributes(actual, expected)
if self.mae:
actual, expected = self._promote_for_comparison(actual, expected)
mae = float(torch.abs(actual - expected).float().mean())
if mae > self.atol:
self._fail(
AssertionError,
f"The MAE of the images is {mae}, but only {self.atol} is allowed.",
)
else:
super()._compare_values(actual, expected)
def assert_close(
actual,
expected,
*,
allow_subclasses=True,
rtol=None,
atol=None,
equal_nan=False,
check_device=True,
check_dtype=True,
check_layout=True,
check_stride=False,
msg=None,
**kwargs,
):
"""Superset of :func:`torch.testing.assert_close` with support for PIL vs. tensor image comparison"""
__tracebackhide__ = True
error_metas = not_close_error_metas(
actual,
expected,
pair_types=(
NonePair,
BooleanPair,
NumberPair,
ImagePair,
TensorLikePair,
),
allow_subclasses=allow_subclasses,
rtol=rtol,
atol=atol,
equal_nan=equal_nan,
check_device=check_device,
check_dtype=check_dtype,
check_layout=check_layout,
check_stride=check_stride,
**kwargs,
)
if error_metas:
raise error_metas[0].to_error(msg)
assert_equal = functools.partial(assert_close, rtol=0, atol=0)
def parametrized_error_message(*args, **kwargs):
def to_str(obj):
if isinstance(obj, torch.Tensor) and obj.numel() > 10:
return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})"
elif isinstance(obj, enum.Enum):
return f"{type(obj).__name__}.{obj.name}"
else:
return repr(obj)
if args or kwargs:
postfix = "\n".join(
[
"",
"Failure happened for the following parameters:",
"",
*[to_str(arg) for arg in args],
*[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()],
]
)
else:
postfix = ""
def wrapper(msg):
return msg + postfix
return wrapper
class ArgsKwargs:
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def __iter__(self):
yield self.args
yield self.kwargs
def load(self, device="cpu"):
return ArgsKwargs(
*(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args),
**{
keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg
for keyword, arg in self.kwargs.items()
},
)
DEFAULT_SQUARE_SPATIAL_SIZE = 15
DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33)
DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9)
DEFAULT_SPATIAL_SIZES = (
DEFAULT_LANDSCAPE_SPATIAL_SIZE,
DEFAULT_PORTRAIT_SPATIAL_SIZE,
DEFAULT_SQUARE_SPATIAL_SIZE,
"random",
)
def _parse_spatial_size(size, *, name="size"):
if size == "random":
return tuple(torch.randint(15, 33, (2,)).tolist())
elif isinstance(size, int) and size > 0:
return (size, size)
elif (
isinstance(size, collections.abc.Sequence)
and len(size) == 2
and all(isinstance(length, int) and length > 0 for length in size)
):
return tuple(size)
else:
raise pytest.UsageError(
f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers,"
f"but got {size} instead."
)
VALID_EXTRA_DIMS = ((), (4,), (2, 3))
DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5))
DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS)
def from_loader(loader_fn):
def wrapper(*args, **kwargs):
device = kwargs.pop("device", "cpu")
loader = loader_fn(*args, **kwargs)
return loader.load(device)
return wrapper
def from_loaders(loaders_fn):
def wrapper(*args, **kwargs):
device = kwargs.pop("device", "cpu")
loaders = loaders_fn(*args, **kwargs)
for loader in loaders:
yield loader.load(device)
return wrapper
@dataclasses.dataclass
class TensorLoader:
fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor]
shape: Sequence[int]
dtype: torch.dtype
def load(self, device):
return self.fn(self.shape, self.dtype, device)
@dataclasses.dataclass
class ImageLoader(TensorLoader):
spatial_size: Tuple[int, int] = dataclasses.field(init=False)
num_channels: int = dataclasses.field(init=False)
def __post_init__(self):
self.spatial_size = self.shape[-2:]
self.num_channels = self.shape[-3]
NUM_CHANNELS_MAP = {
"GRAY": 1,
"GRAY_ALPHA": 2,
"RGB": 3,
"RGBA": 4,
}
def get_num_channels(color_space):
num_channels = NUM_CHANNELS_MAP.get(color_space)
if not num_channels:
raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}")
return num_channels
def make_image_loader(
size="random",
*,
color_space="RGB",
extra_dims=(),
dtype=torch.float32,
constant_alpha=True,
):
size = _parse_spatial_size(size)
num_channels = get_num_channels(color_space)
def fn(shape, dtype, device):
max_value = get_max_value(dtype)
data = torch.testing.make_tensor(shape, low=0, high=max_value, dtype=dtype, device=device)
if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha:
data[..., -1, :, :] = max_value
return datapoints.Image(data)
return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype)
make_image = from_loader(make_image_loader)
def make_image_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
color_spaces=(
"GRAY",
"GRAY_ALPHA",
"RGB",
"RGBA",
),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.float32, torch.float64, torch.uint8),
constant_alpha=True,
):
for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes):
yield make_image_loader(**params, constant_alpha=constant_alpha)
make_images = from_loaders(make_image_loaders)
def make_image_loader_for_interpolation(size="random", *, color_space="RGB", dtype=torch.uint8):
size = _parse_spatial_size(size)
num_channels = get_num_channels(color_space)
def fn(shape, dtype, device):
height, width = shape[-2:]
image_pil = (
PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")
.resize((width, height))
.convert(
{
"GRAY": "L",
"GRAY_ALPHA": "LA",
"RGB": "RGB",
"RGBA": "RGBA",
}[color_space]
)
)
image_tensor = convert_dtype_image_tensor(to_image_tensor(image_pil).to(device=device), dtype=dtype)
return datapoints.Image(image_tensor)
return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype)
def make_image_loaders_for_interpolation(
sizes=((233, 147),),
color_spaces=("RGB",),
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes):
yield make_image_loader_for_interpolation(**params)
@dataclasses.dataclass
class BoundingBoxLoader(TensorLoader):
format: datapoints.BoundingBoxFormat
spatial_size: Tuple[int, int]
def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
low, high = torch.broadcast_tensors(
*[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))]
)
return torch.stack(
[
torch.randint(low_scalar, high_scalar, (), **kwargs)
for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist())
]
).reshape(low.shape)
def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32):
if isinstance(format, str):
format = datapoints.BoundingBoxFormat[format]
if format not in {
datapoints.BoundingBoxFormat.XYXY,
datapoints.BoundingBoxFormat.XYWH,
datapoints.BoundingBoxFormat.CXCYWH,
}:
raise pytest.UsageError(f"Can't make bounding box in format {format}")
spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")
def fn(shape, dtype, device):
*extra_dims, num_coordinates = shape
if num_coordinates != 4:
raise pytest.UsageError()
if any(dim == 0 for dim in extra_dims):
return datapoints.BoundingBox(
torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
)
height, width = spatial_size
if format == datapoints.BoundingBoxFormat.XYXY:
x1 = torch.randint(0, width // 2, extra_dims)
y1 = torch.randint(0, height // 2, extra_dims)
x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1
y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1
parts = (x1, y1, x2, y2)
elif format == datapoints.BoundingBoxFormat.XYWH:
x = torch.randint(0, width // 2, extra_dims)
y = torch.randint(0, height // 2, extra_dims)
w = randint_with_tensor_bounds(1, width - x)
h = randint_with_tensor_bounds(1, height - y)
parts = (x, y, w, h)
else:  # format == datapoints.BoundingBoxFormat.CXCYWH
cx = torch.randint(1, width - 1, extra_dims)
cy = torch.randint(1, height - 1, extra_dims)
w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1)
h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1)
parts = (cx, cy, w, h)
return datapoints.BoundingBox(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
)
return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size)
make_bounding_box = from_loader(make_bounding_box_loader)
def make_bounding_box_loaders(
*,
extra_dims=DEFAULT_EXTRA_DIMS,
formats=tuple(datapoints.BoundingBoxFormat),
spatial_size="random",
dtypes=(torch.float32, torch.float64, torch.int64),
):
for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes):
yield make_bounding_box_loader(**params, spatial_size=spatial_size)
from common_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader
from torch.nn.functional import one_hot
make_bounding_boxes = from_loaders(make_bounding_box_loaders)
from torchvision.prototype import datapoints
@dataclasses.dataclass
@@ -458,7 +40,7 @@ def make_label_loader(*, extra_dims=(), categories=None, dtype=torch.int64):
# The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values,
# regardless of the requested dtype, e.g. 0 or 0.0 rather than 0 or 0.123
data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype)
return proto_datapoints.Label(data, categories=categories)
return datapoints.Label(data, categories=categories)
return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories)
@@ -482,7 +64,7 @@ def make_one_hot_label_loader(*, categories=None, extra_dims=(), dtype=torch.int
# since `one_hot` only supports int64
label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device)
data = one_hot(label, num_classes=num_categories).to(dtype)
return proto_datapoints.OneHotLabel(data, categories=categories)
return datapoints.OneHotLabel(data, categories=categories)
return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories)
@@ -498,195 +80,3 @@ def make_one_hot_label_loaders(
make_one_hot_labels = from_loaders(make_one_hot_label_loaders)
class MaskLoader(TensorLoader):
pass
def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8):
# This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects
size = _parse_spatial_size(size)
num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects
def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=2, dtype=dtype, device=device)
return datapoints.Mask(data)
return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype)
make_detection_mask = from_loader(make_detection_mask_loader)
def make_detection_mask_loaders(
sizes=DEFAULT_SPATIAL_SIZES,
num_objects=(1, 0, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes):
yield make_detection_mask_loader(**params)
make_detection_masks = from_loaders(make_detection_mask_loaders)
def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
# This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
size = _parse_spatial_size(size)
num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories
def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=dtype, device=device)
return datapoints.Mask(data)
return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype)
make_segmentation_mask = from_loader(make_segmentation_mask_loader)
def make_segmentation_mask_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
num_categories=(1, 2, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes):
yield make_segmentation_mask_loader(**params)
make_segmentation_masks = from_loaders(make_segmentation_mask_loaders)
def make_mask_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
num_objects=(1, 0, "random"),
num_categories=(1, 2, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8,),
):
yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes)
yield from make_segmentation_mask_loaders(
sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes
)
make_masks = from_loaders(make_mask_loaders)
class VideoLoader(ImageLoader):
pass
def make_video_loader(
size="random",
*,
color_space="RGB",
num_frames="random",
extra_dims=(),
dtype=torch.uint8,
):
size = _parse_spatial_size(size)
num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames
def fn(shape, dtype, device):
video = make_image(size=shape[-2:], extra_dims=shape[:-3], dtype=dtype, device=device)
return datapoints.Video(video)
return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype)
make_video = from_loader(make_video_loader)
def make_video_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
color_spaces=(
"GRAY",
"RGB",
),
num_frames=(1, 0, "random"),
extra_dims=DEFAULT_EXTRA_DIMS,
dtypes=(torch.uint8, torch.float32, torch.float64),
):
for params in combinations_grid(
size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes
):
yield make_video_loader(**params)
make_videos = from_loaders(make_video_loaders)
class TestMark:
def __init__(
self,
# Tuple of test class name and test function name that identifies the test the mark is applied to. If there is
# no test class, i.e. a standalone test function, use `None`.
test_id,
# `pytest.mark.*` to apply, e.g. `pytest.mark.skip` or `pytest.mark.xfail`
mark,
*,
# Callable that will be passed an `ArgsKwargs` and should return a boolean indicating whether the mark
# should be applied. If omitted, the mark is always applied.
condition=None,
):
self.test_id = test_id
self.mark = mark
self.condition = condition or (lambda args_kwargs: True)
def mark_framework_limitation(test_id, reason, condition=None):
# The purpose of this function is to have a single entry point for skip marks that exist only because the test
# framework cannot handle the kernel in general or a specific parameter combination.
# As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is
# still justified.
# We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus,
# we would be wasting CI resources for no reason most of the time.
return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition)
class InfoBase:
def __init__(
self,
*,
# Identifier of the info that shows up in the parametrization.
id,
# Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization.
# See the `TestMark` class for details
test_marks=None,
# Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see
# `TestMark`), the dtype, and the device.
closeness_kwargs=None,
):
self.id = id
self.test_marks = test_marks or []
test_marks_map = defaultdict(list)
for test_mark in self.test_marks:
test_marks_map[test_mark.test_id].append(test_mark)
self._test_marks_map = dict(test_marks_map)
self.closeness_kwargs = closeness_kwargs or dict()
def get_marks(self, test_id, args_kwargs):
return [
test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs)
]
def get_closeness_kwargs(self, test_id, *, dtype, device):
if not (isinstance(test_id, tuple) and len(test_id) == 2):
msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name"
if callable(test_id):
msg += ". Did you forget to add the `test_id` fixture to parameters of the test?"
else:
msg += f", but got {test_id} instead."
raise pytest.UsageError(msg)
if isinstance(device, torch.device):
device = device.type
return self.closeness_kwargs.get((test_id, dtype, device), dict())
import pytest
import torch
from PIL import Image
from torchvision import datapoints
@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)])
def test_image_instance(data):
image = datapoints.Image(data)
assert isinstance(image, torch.Tensor)
assert image.ndim == 3 and image.shape[0] == 3
@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)])
def test_mask_instance(data):
mask = datapoints.Mask(data)
assert isinstance(mask, torch.Tensor)
assert mask.ndim == 3 and mask.shape[0] == 1
@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]]])
@pytest.mark.parametrize(
"format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH]
)
def test_bbox_instance(data, format):
bboxes = datapoints.BoundingBox(data, format=format, spatial_size=(32, 32))
assert isinstance(bboxes, torch.Tensor)
assert bboxes.ndim == 2 and bboxes.shape[1] == 4
if isinstance(format, str):
format = datapoints.BoundingBoxFormat.from_str(format.upper())
assert bboxes.format == format
@@ -22,12 +22,13 @@ import PIL
import pytest
import torch
import torch.nn.functional as F
from common_utils import combinations_grid
from torchvision import datasets
class STL10TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.STL10
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled"))
@staticmethod
def _make_binary_file(num_elements, root, name):
@@ -113,9 +114,7 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Caltech101
FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
target_type=("category", "annotation", ["category", "annotation"])
)
ADDITIONAL_CONFIGS = combinations_grid(target_type=("category", "annotation", ["category", "annotation"]))
REQUIRED_PACKAGES = ("scipy",)
def inject_fake_data(self, tmpdir, config):
@@ -208,7 +207,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase):
class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.WIDERFace
FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
def inject_fake_data(self, tmpdir, config):
widerface_dir = pathlib.Path(tmpdir) / "widerface"
@@ -269,8 +268,8 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase):
"color",
)
ADDITIONAL_CONFIGS = (
*datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES),
*datasets_utils.combinations_grid(
*combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES),
*combinations_grid(
mode=("coarse",),
split=("train", "train_extra", "val"),
target_type=TARGET_TYPES,
@@ -387,7 +386,7 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase):
class ImageNetTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.ImageNet
REQUIRED_PACKAGES = ("scipy",)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"))
def inject_fake_data(self, tmpdir, config):
tmpdir = pathlib.Path(tmpdir)
@@ -417,7 +416,7 @@ class ImageNetTestCase(datasets_utils.ImageDatasetTestCase):
class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.CIFAR10
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(train=(True, False))
_VERSION_CONFIG = dict(
base_folder="cifar-10-batches-py",
@@ -490,7 +489,7 @@ class CelebATestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.CelebA
FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None)))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train", "valid", "test", "all"),
target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]),
)
@@ -614,9 +613,7 @@ class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase):
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image)
ADDITIONAL_CONFIGS = (
*datasets_utils.combinations_grid(
year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")
),
*combinations_grid(year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")),
dict(year="2007", image_set="test"),
)
@@ -791,7 +788,7 @@ class CocoCaptionsTestCase(CocoDetectionTestCase):
def _create_annotations(self, image_ids, num_annotations_per_image):
captions = [str(idx) for idx in range(num_annotations_per_image)]
annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions)
annotations = combinations_grid(image_id=image_ids, caption=captions)
for id, annotation in enumerate(annotations):
annotation["id"] = id
return annotations, dict(captions=captions)
@@ -805,7 +802,7 @@ class CocoCaptionsTestCase(CocoDetectionTestCase):
class UCF101TestCase(datasets_utils.VideoDatasetTestCase):
DATASET_CLASS = datasets.UCF101
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False))
_VIDEO_FOLDER = "videos"
_ANNOTATIONS_FOLDER = "annotations"
@@ -866,9 +863,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.LSUN
REQUIRED_PACKAGES = ("lmdb",)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"])
)
ADDITIONAL_CONFIGS = combinations_grid(classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]))
_CATEGORIES = (
"bedroom",
@@ -953,7 +948,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase):
class KineticsTestCase(datasets_utils.VideoDatasetTestCase):
DATASET_CLASS = datasets.Kinetics
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"), num_classes=("400", "600", "700"))
def inject_fake_data(self, tmpdir, config):
classes = ("Abseiling", "Zumba")
@@ -973,7 +968,7 @@ class KineticsTestCase(datasets_utils.VideoDatasetTestCase):
class HMDB51TestCase(datasets_utils.VideoDatasetTestCase):
DATASET_CLASS = datasets.HMDB51
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False))
_VIDEO_FOLDER = "videos"
_SPLITS_FOLDER = "splits"
@@ -1033,7 +1028,7 @@ class HMDB51TestCase(datasets_utils.VideoDatasetTestCase):
class OmniglotTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Omniglot
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(background=(True, False))
def inject_fake_data(self, tmpdir, config):
target_folder = (
@@ -1113,7 +1108,7 @@ class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase):
class USPSTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.USPS
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(train=(True, False))
def inject_fake_data(self, tmpdir, config):
num_images = 2 if config["train"] else 1
@@ -1135,7 +1130,7 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase):
REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse")
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation")
)
@@ -1221,7 +1216,7 @@ class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase):
_TRAIN_FEATURE_TYPES = (torch.Tensor,)
_TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor)
datasets_utils.combinations_grid(train=(True, False))
combinations_grid(train=(True, False))
_NAME = "liberty"
@@ -1380,7 +1375,7 @@ class Flickr30kTestCase(Flickr8kTestCase):
class MNISTTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.MNIST
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(train=(True, False))
_MAGIC_DTYPES = {
torch.uint8: 8,
@@ -1450,7 +1445,7 @@ class EMNISTTestCase(MNISTTestCase):
DATASET_CLASS = datasets.EMNIST
DEFAULT_CONFIG = dict(split="byclass")
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("byclass", "bymerge", "balanced", "letters", "digits", "mnist"), train=(True, False)
)
@@ -1461,7 +1456,7 @@ class EMNISTTestCase(MNISTTestCase):
class QMNISTTestCase(MNISTTestCase):
DATASET_CLASS = datasets.QMNIST
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(what=("train", "test", "test10k", "nist"))
ADDITIONAL_CONFIGS = combinations_grid(what=("train", "test", "test10k", "nist"))
_LABELS_SIZE = (8,)
_LABELS_DTYPE = torch.int32
@@ -1507,7 +1502,7 @@ class MovingMNISTTestCase(datasets_utils.DatasetTestCase):
DATASET_CLASS = datasets.MovingMNIST
FEATURE_TYPES = (torch.Tensor,)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19))
ADDITIONAL_CONFIGS = combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19))
def inject_fake_data(self, tmpdir, config):
base_folder = os.path.join(tmpdir, self.DATASET_CLASS.__name__)
@@ -1543,7 +1538,7 @@ class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase):
# We only iterate over different 'extensions' here and handle the tests for 'is_valid_file' in the
# 'test_is_valid_file()' method.
DEFAULT_CONFIG = dict(extensions=_EXTENSIONS)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS])
ADDITIONAL_CONFIGS = combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS])
def dataset_args(self, tmpdir, config):
return tmpdir, datasets.folder.pil_loader
@@ -1612,7 +1607,7 @@ class ImageFolderTestCase(datasets_utils.ImageDatasetTestCase):
class KittiTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Kitti
FEATURE_TYPES = (PIL.Image.Image, (list, type(None))) # test split returns None as target
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))
ADDITIONAL_CONFIGS = combinations_grid(train=(True, False))
def inject_fake_data(self, tmpdir, config):
kitti_dir = os.path.join(tmpdir, "Kitti", "raw")
@@ -1648,7 +1643,7 @@ class KittiTestCase(datasets_utils.ImageDatasetTestCase):
class SvhnTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.SVHN
REQUIRED_PACKAGES = ("scipy",)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "extra"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "extra"))
def inject_fake_data(self, tmpdir, config):
import scipy.io as sio
@@ -1669,7 +1664,7 @@ class SvhnTestCase(datasets_utils.ImageDatasetTestCase):
class Places365TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Places365
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train-standard", "train-challenge", "val"),
small=(False, True),
)
@@ -1761,7 +1756,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.INaturalist
FEATURE_TYPES = (PIL.Image.Image, (int, tuple))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]),
version=("2021_train",),
)
@@ -1798,7 +1793,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase):
class LFWPeopleTestCase(datasets_utils.DatasetTestCase):
DATASET_CLASS = datasets.LFWPeople
FEATURE_TYPES = (PIL.Image.Image, int)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("10fold", "train", "test"), image_set=("original", "funneled", "deepfunneled")
)
_IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"}
@@ -1874,7 +1869,7 @@ class LFWPairsTestCase(LFWPeopleTestCase):
class SintelTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Sintel
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
FLOW_H, FLOW_W = 3, 4
@@ -1942,7 +1937,7 @@ class SintelTestCase(datasets_utils.ImageDatasetTestCase):
class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.KittiFlow
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
def inject_fake_data(self, tmpdir, config):
@@ -2002,7 +1997,7 @@ class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase):
class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.FlyingChairs
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
FLOW_H, FLOW_W = 3, 4
@@ -2057,7 +2052,7 @@ class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase):
class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.FlyingThings3D
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train", "test"), pass_name=("clean", "final", "both"), camera=("left", "right", "both")
)
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
@@ -2194,7 +2189,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Food101
FEATURE_TYPES = (PIL.Image.Image, int)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
def inject_fake_data(self, tmpdir: str, config):
root_folder = pathlib.Path(tmpdir) / "food-101"
@@ -2229,7 +2224,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase):
class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.FGVCAircraft
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer")
)
@@ -2312,7 +2307,7 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.DTD
FEATURE_TYPES = (PIL.Image.Image, int)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train", "test", "val"),
# There is no need to test the whole matrix here, since each fold is treated exactly the same
partition=(1, 5, 10),
@@ -2346,7 +2341,7 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase):
class FER2013TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.FER2013
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
FEATURE_TYPES = (PIL.Image.Image, (int, type(None)))
@@ -2381,7 +2376,7 @@ class GTSRBTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.GTSRB
FEATURE_TYPES = (PIL.Image.Image, int)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
def inject_fake_data(self, tmpdir: str, config):
root_folder = os.path.join(tmpdir, "gtsrb")
@@ -2431,7 +2426,7 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.CLEVRClassification
FEATURE_TYPES = (PIL.Image.Image, (int, type(None)))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
def inject_fake_data(self, tmpdir, config):
data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0"
@@ -2463,7 +2458,7 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.OxfordIIITPet
FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None)))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("trainval", "test"),
target_types=("category", "segmentation", ["category", "segmentation"], []),
)
@@ -2522,7 +2517,7 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase):
class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.StanfordCars
REQUIRED_PACKAGES = ("scipy",)
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
def inject_fake_data(self, tmpdir, config):
import scipy.io as io
@@ -2566,7 +2561,7 @@ class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase):
class Country211TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Country211
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test"))
def inject_fake_data(self, tmpdir: str, config):
split_folder = pathlib.Path(tmpdir) / "country211" / config["split"]
@@ -2593,7 +2588,7 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase):
class Flowers102TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Flowers102
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
REQUIRED_PACKAGES = ("scipy",)
def inject_fake_data(self, tmpdir: str, config):
@@ -2629,7 +2624,7 @@ class Flowers102TestCase(datasets_utils.ImageDatasetTestCase):
class PCAMTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.PCAM
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
REQUIRED_PACKAGES = ("h5py",)
def inject_fake_data(self, tmpdir: str, config):
......@@ -2651,7 +2646,7 @@ class PCAMTestCase(datasets_utils.ImageDatasetTestCase):
class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.RenderedSST2
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"}
def inject_fake_data(self, tmpdir: str, config):
......@@ -2673,7 +2668,7 @@ class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase):
class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Kitti2012Stereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
def inject_fake_data(self, tmpdir, config):
......@@ -2735,7 +2730,7 @@ class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase):
class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Kitti2015Stereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
def inject_fake_data(self, tmpdir, config):
......@@ -2873,7 +2868,7 @@ class CREStereoTestCase(datasets_utils.ImageDatasetTestCase):
class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.FallingThingsStereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both"))
ADDITIONAL_CONFIGS = combinations_grid(variant=("single", "mixed", "both"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
@staticmethod
......@@ -2947,7 +2942,7 @@ class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase):
class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.SceneFlowStereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
variant=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final", "both")
)
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
......@@ -3034,7 +3029,7 @@ class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase):
class InStereo2k(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.InStereo2k
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
@staticmethod
def _make_scene_folder(root: str, name: str, size: Tuple[int, int]):
......@@ -3076,7 +3071,7 @@ class InStereo2k(datasets_utils.ImageDatasetTestCase):
class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.SintelStereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both"))
ADDITIONAL_CONFIGS = combinations_grid(pass_name=("final", "clean", "both"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
def inject_fake_data(self, tmpdir, config):
......@@ -3152,7 +3147,7 @@ class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase):
class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.ETH3DStereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
@staticmethod
......@@ -3219,7 +3214,7 @@ class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase):
class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.Middlebury2014Stereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
ADDITIONAL_CONFIGS = combinations_grid(
split=("train", "additional"),
calibration=("perfect", "imperfect", "both"),
use_ambient_views=(True, False),
......
import pytest
import torch
from PIL import Image
from torchvision import datapoints
from torchvision.prototype import datapoints as proto_datapoints
......@@ -134,30 +131,3 @@ def test_wrap_like():
assert type(label_new) is proto_datapoints.Label
assert label_new.data_ptr() == output.data_ptr()
assert label_new.categories is label.categories
@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)])
def test_image_instance(data):
image = datapoints.Image(data)
assert isinstance(image, torch.Tensor)
assert image.ndim == 3 and image.shape[0] == 3
@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)])
def test_mask_instance(data):
mask = datapoints.Mask(data)
assert isinstance(mask, torch.Tensor)
assert mask.ndim == 3 and mask.shape[0] == 1
@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]]])
@pytest.mark.parametrize(
"format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH]
)
def test_bbox_instance(data, format):
bboxes = datapoints.BoundingBox(data, format=format, spatial_size=(32, 32))
assert isinstance(bboxes, torch.Tensor)
assert bboxes.ndim == 2 and bboxes.shape[1] == 4
if isinstance(format, str):
format = datapoints.BoundingBoxFormat.from_str(format.upper())
assert bboxes.format == format
import itertools
import pathlib
import random
import re
import warnings
from collections import defaultdict
import numpy as np
import re
import PIL.Image
import pytest
import torch
import torchvision.prototype.datapoints as proto_datapoints
import torchvision.prototype.transforms as proto_transforms
import torchvision.transforms.v2 as transforms
import torchvision.transforms.v2.utils
from common_utils import cpu_and_gpu
from prototype_common_utils import (
from common_utils import (
assert_equal,
DEFAULT_EXTRA_DIMS,
make_bounding_box,
make_bounding_boxes,
make_detection_mask,
make_image,
make_images,
make_label,
make_one_hot_labels,
make_segmentation_mask,
make_video,
make_videos,
)
from torch.utils._pytree import tree_flatten, tree_unflatten
from torchvision import datapoints
from torchvision.ops.boxes import box_iou
from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image
from torchvision.transforms.v2 import functional as F
from torchvision.transforms.v2._utils import _convert_fill_arg
from torchvision.transforms.v2.utils import check_type, is_simple_tensor, query_chw
BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims]
def make_vanilla_tensor_images(*args, **kwargs):
for image in make_images(*args, **kwargs):
if image.ndim > 3:
continue
yield image.data
def make_pil_images(*args, **kwargs):
for image in make_vanilla_tensor_images(*args, **kwargs):
yield to_pil_image(image)
from prototype_common_utils import make_label, make_one_hot_labels
from torchvision.datapoints import BoundingBox, BoundingBoxFormat, Image, Mask, Video
from torchvision.prototype import datapoints, transforms
from torchvision.transforms.v2._utils import _convert_fill_arg
from torchvision.transforms.v2.functional import InterpolationMode, pil_to_tensor, to_image_pil
from torchvision.transforms.v2.utils import check_type, is_simple_tensor
def make_vanilla_tensor_bounding_boxes(*args, **kwargs):
for bounding_box in make_bounding_boxes(*args, **kwargs):
yield bounding_box.data
BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims]
def parametrize(transforms_with_inputs):
......@@ -73,1541 +44,47 @@ def parametrize(transforms_with_inputs):
)
def auto_augment_adapter(transform, input, device):
adapted_input = {}
image_or_video_found = False
for key, value in input.items():
if isinstance(value, (datapoints.BoundingBox, datapoints.Mask)):
# AA transforms don't support bounding boxes or masks
continue
elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor, PIL.Image.Image)):
if image_or_video_found:
# AA transforms only support a single image or video
continue
image_or_video_found = True
adapted_input[key] = value
return adapted_input
def linear_transformation_adapter(transform, input, device):
flat_inputs = list(input.values())
c, h, w = query_chw(
[
item
for item, needs_transform in zip(flat_inputs, transforms.Transform()._needs_transform_list(flat_inputs))
if needs_transform
]
)
num_elements = c * h * w
transform.transformation_matrix = torch.randn((num_elements, num_elements), device=device)
transform.mean_vector = torch.randn((num_elements,), device=device)
return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)}
def normalize_adapter(transform, input, device):
adapted_input = {}
for key, value in input.items():
if isinstance(value, PIL.Image.Image):
# normalize doesn't support PIL images
continue
elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor)):
# normalize doesn't support integer images
value = F.convert_dtype(value, torch.float32)
adapted_input[key] = value
return adapted_input
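# Hedged sketch, not part of this change: `check_type`, used by the adapters above, accepts
# a mix of plain types and predicate callables such as `is_simple_tensor`. A rough equivalent
# of its semantics (the actual implementation lives in torchvision.transforms.v2.utils):
def _check_type_sketch(obj, types_or_checks):
    return any(
        isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj)
        for type_or_check in types_or_checks
    )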
class TestSmoke:
@pytest.mark.parametrize(
("transform", "adapter"),
[
(transforms.RandomErasing(p=1.0), None),
(transforms.AugMix(), auto_augment_adapter),
(transforms.AutoAugment(), auto_augment_adapter),
(transforms.RandAugment(), auto_augment_adapter),
(transforms.TrivialAugmentWide(), auto_augment_adapter),
(transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None),
(transforms.Grayscale(), None),
(transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None),
(transforms.RandomAutocontrast(p=1.0), None),
(transforms.RandomEqualize(p=1.0), None),
(transforms.RandomGrayscale(p=1.0), None),
(transforms.RandomInvert(p=1.0), None),
(transforms.RandomPhotometricDistort(p=1.0), None),
(transforms.RandomPosterize(bits=4, p=1.0), None),
(transforms.RandomSolarize(threshold=0.5, p=1.0), None),
(transforms.CenterCrop([16, 16]), None),
(transforms.ElasticTransform(sigma=1.0), None),
(transforms.Pad(4), None),
(transforms.RandomAffine(degrees=30.0), None),
(transforms.RandomCrop([16, 16], pad_if_needed=True), None),
(transforms.RandomHorizontalFlip(p=1.0), None),
(transforms.RandomPerspective(p=1.0), None),
(transforms.RandomResize(min_size=10, max_size=20), None),
(transforms.RandomResizedCrop([16, 16]), None),
(transforms.RandomRotation(degrees=30), None),
(transforms.RandomShortestSize(min_size=10), None),
(transforms.RandomVerticalFlip(p=1.0), None),
(transforms.RandomZoomOut(p=1.0), None),
(transforms.Resize([16, 16], antialias=True), None),
(transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
(transforms.ClampBoundingBox(), None),
(transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
(transforms.ConvertDtype(), None),
(transforms.GaussianBlur(kernel_size=3), None),
(
transforms.LinearTransformation(
# These are just dummy values that will be filled by the adapter. We can't define them upfront,
# because we neither know the spatial size nor the device at this point
transformation_matrix=torch.empty((1, 1)),
mean_vector=torch.empty((1,)),
),
linear_transformation_adapter,
),
(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter),
(transforms.ToDtype(torch.float64), None),
(transforms.UniformTemporalSubsample(num_samples=2), None),
],
ids=lambda transform: type(transform).__name__,
)
@pytest.mark.parametrize("container_type", [dict, list, tuple])
@pytest.mark.parametrize(
"image_or_video",
[
make_image(),
make_video(),
next(make_pil_images(color_spaces=["RGB"])),
next(make_vanilla_tensor_images()),
],
)
@pytest.mark.parametrize("device", cpu_and_gpu())
def test_common(self, transform, adapter, container_type, image_or_video, device):
spatial_size = F.get_spatial_size(image_or_video)
input = dict(
image_or_video=image_or_video,
image_datapoint=make_image(size=spatial_size),
video_datapoint=make_video(size=spatial_size),
image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])),
bounding_box_xyxy=make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(3,)
),
bounding_box_xywh=make_bounding_box(
format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, extra_dims=(4,)
),
bounding_box_cxcywh=make_bounding_box(
format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, extra_dims=(5,)
),
bounding_box_degenerate_xyxy=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[2, 0, 1, 1], # x1 > x2, y1 < y2
[0, 2, 1, 1], # x1 < x2, y1 > y2
[2, 2, 1, 1], # x1 > x2, y1 > y2
],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=spatial_size,
),
bounding_box_degenerate_xywh=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[0, 0, 1, -1], # negative height
[0, 0, -1, 1], # negative width
[0, 0, -1, -1], # negative height and width
],
format=datapoints.BoundingBoxFormat.XYWH,
spatial_size=spatial_size,
),
bounding_box_degenerate_cxcywh=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[0, 0, 1, -1], # negative height
[0, 0, -1, 1], # negative width
[0, 0, -1, -1], # negative height and width
],
format=datapoints.BoundingBoxFormat.CXCYWH,
spatial_size=spatial_size,
),
detection_mask=make_detection_mask(size=spatial_size),
segmentation_mask=make_segmentation_mask(size=spatial_size),
int=0,
float=0.0,
bool=True,
none=None,
str="str",
path=pathlib.Path.cwd(),
object=object(),
tensor=torch.empty(5),
array=np.empty(5),
@parametrize(
[
(
transform,
[
dict(inpt=inpt, one_hot_label=one_hot_label)
for inpt, one_hot_label in itertools.product(
itertools.chain(
make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
make_videos(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
),
make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
)
],
)
if adapter is not None:
input = adapter(transform, input, device)
if container_type in {tuple, list}:
input = container_type(input.values())
input_flat, input_spec = tree_flatten(input)
input_flat = [item.to(device) if isinstance(item, torch.Tensor) else item for item in input_flat]
input = tree_unflatten(input_flat, input_spec)
torch.manual_seed(0)
output = transform(input)
output_flat, output_spec = tree_flatten(output)
assert output_spec == input_spec
for output_item, input_item, should_be_transformed in zip(
output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat)
):
if should_be_transformed:
assert type(output_item) is type(input_item)
else:
assert output_item is input_item
if isinstance(input_item, datapoints.BoundingBox) and not isinstance(
transform, transforms.ConvertBoundingBoxFormat
):
assert output_item.format == input_item.format
# Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future
# transform that does this) back into a valid one.
# TODO: we should test that against all degenerate boxes above
for format in list(datapoints.BoundingBoxFormat):
sample = dict(
boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)),
labels=torch.tensor([3]),
)
assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4)
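# Hedged sketch, not part of this change, of what the degenerate-box check above relies on:
# SanitizeBoundingBoxes drops boxes without positive height/width and, with its default
# labels_getter, the corresponding entries of a "labels" key (exact defaults are an
# assumption about this torchvision version).
sanitize_sample = dict(
    boxes=datapoints.BoundingBox(
        [[0, 0, 10, 10], [5, 5, 5, 5]], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(32, 32)
    ),
    labels=torch.tensor([1, 2]),
)
sanitized = transforms.SanitizeBoundingBoxes()(sanitize_sample)
# sanitized["boxes"].shape == (1, 4) and sanitized["labels"].tolist() == [1]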
@parametrize(
[
(
transform,
[
dict(inpt=inpt, one_hot_label=one_hot_label)
for inpt, one_hot_label in itertools.product(
itertools.chain(
make_images(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
make_videos(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
),
make_one_hot_labels(extra_dims=BATCH_EXTRA_DIMS, dtypes=[torch.float]),
)
],
)
for transform in [
proto_transforms.RandomMixup(alpha=1.0),
proto_transforms.RandomCutmix(alpha=1.0),
]
]
)
def test_mixup_cutmix(self, transform, input):
transform(input)
# add other data that should bypass and won't raise any error
input_copy = dict(input)
input_copy["path"] = "/path/to/somewhere"
input_copy["num"] = 1234
transform(input_copy)
# Check that we raise an error if the sample contains a bbox, mask, or label
err_msg = "does not support PIL images, bounding boxes, masks and plain labels"
input_copy = dict(input)
for unsup_data in [
make_label(),
make_bounding_box(format="XYXY"),
make_detection_mask(),
make_segmentation_mask(),
]:
input_copy["unsupported"] = unsup_data
with pytest.raises(TypeError, match=err_msg):
transform(input_copy)
@parametrize(
[
(
transform,
itertools.chain.from_iterable(
fn(
color_spaces=[
"GRAY",
"RGB",
],
dtypes=[torch.uint8],
extra_dims=[(), (4,)],
**(dict(num_frames=["random"]) if fn is make_videos else dict()),
)
for fn in [
make_images,
make_vanilla_tensor_images,
make_pil_images,
make_videos,
]
),
)
for transform in (
transforms.RandAugment(),
transforms.TrivialAugmentWide(),
transforms.AutoAugment(),
transforms.AugMix(),
)
]
)
def test_auto_augment(self, transform, input):
transform(input)
@parametrize(
[
(
transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]),
itertools.chain.from_iterable(
fn(color_spaces=["RGB"], dtypes=[torch.float32])
for fn in [
make_images,
make_vanilla_tensor_images,
make_videos,
]
),
),
for transform in [
transforms.RandomMixup(alpha=1.0),
transforms.RandomCutmix(alpha=1.0),
]
)
def test_normalize(self, transform, input):
transform(input)
@parametrize(
[
(
transforms.RandomResizedCrop([16, 16], antialias=True),
itertools.chain(
make_images(extra_dims=[(4,)]),
make_vanilla_tensor_images(),
make_pil_images(),
make_videos(extra_dims=[()]),
),
)
]
)
def test_random_resized_crop(self, transform, input):
transform(input)
@pytest.mark.parametrize(
"flat_inputs",
itertools.permutations(
[
next(make_vanilla_tensor_images()),
next(make_vanilla_tensor_images()),
next(make_pil_images()),
make_image(),
next(make_videos()),
],
3,
),
]
)
def test_simple_tensor_heuristic(flat_inputs):
def split_on_simple_tensor(to_split):
# This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts:
# 1. The first simple tensor. If none is present, this will be `None`
# 2. A list of the remaining simple tensors
# 3. A list of all other items
simple_tensors = []
others = []
# Splitting always happens on the original `flat_inputs` so that any erroneous type changes made by the
# transform do not affect the splitting.
for item, inpt in zip(to_split, flat_inputs):
(simple_tensors if is_simple_tensor(inpt) else others).append(item)
return simple_tensors[0] if simple_tensors else None, simple_tensors[1:], others
class CopyCloneTransform(transforms.Transform):
def _transform(self, inpt, params):
return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy()
@staticmethod
def was_applied(output, inpt):
identity = output is inpt
if identity:
return False
# Make sure nothing fishy is going on
assert_equal(output, inpt)
return True
first_simple_tensor_input, other_simple_tensor_inputs, other_inputs = split_on_simple_tensor(flat_inputs)
transform = CopyCloneTransform()
transformed_sample = transform(flat_inputs)
first_simple_tensor_output, other_simple_tensor_outputs, other_outputs = split_on_simple_tensor(transformed_sample)
if first_simple_tensor_input is not None:
if other_inputs:
assert not transform.was_applied(first_simple_tensor_output, first_simple_tensor_input)
else:
assert transform.was_applied(first_simple_tensor_output, first_simple_tensor_input)
for output, inpt in zip(other_simple_tensor_outputs, other_simple_tensor_inputs):
assert not transform.was_applied(output, inpt)
for input, output in zip(other_inputs, other_outputs):
assert transform.was_applied(output, input)
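# Hedged sketch, not part of this change, of the heuristic verified above: a plain tensor is
# treated as an image only when no other image-like input (datapoint Image/Video or PIL image)
# is present in the sample; otherwise it is passed through untouched.
plain = torch.rand(3, 8, 8)
flipped_plain = transforms.RandomHorizontalFlip(p=1.0)(plain)  # plain tensor alone: transformed
image = datapoints.Image(torch.rand(3, 8, 8))
passthrough, flipped_image = transforms.RandomHorizontalFlip(p=1.0)(plain, image)
# `passthrough` is the untouched plain tensor, `flipped_image` is the transformed datapoint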
@pytest.mark.parametrize("p", [0.0, 1.0])
class TestRandomHorizontalFlip:
def input_expected_image_tensor(self, p, dtype=torch.float32):
input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype)
expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype)
return input, expected if p == 1 else input
def test_simple_tensor(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(input)
assert_equal(expected, actual)
def test_pil_image(self, p):
input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(to_pil_image(input))
assert_equal(expected, pil_to_tensor(actual))
def test_datapoints_image(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(datapoints.Image(input))
assert_equal(datapoints.Image(expected), actual)
def test_datapoints_mask(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(datapoints.Mask(input))
assert_equal(datapoints.Mask(expected), actual)
def test_datapoints_bounding_box(self, p):
input = datapoints.BoundingBox([0, 0, 5, 5], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(input)
expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input
expected = datapoints.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual)
assert actual.format == expected.format
assert actual.spatial_size == expected.spatial_size
@pytest.mark.parametrize("p", [0.0, 1.0])
class TestRandomVerticalFlip:
def input_expected_image_tensor(self, p, dtype=torch.float32):
input = torch.tensor([[[1, 1], [0, 0]], [[1, 1], [0, 0]]], dtype=dtype)
expected = torch.tensor([[[0, 0], [1, 1]], [[0, 0], [1, 1]]], dtype=dtype)
return input, expected if p == 1 else input
def test_simple_tensor(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(input)
assert_equal(expected, actual)
def test_pil_image(self, p):
input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(to_pil_image(input))
assert_equal(expected, pil_to_tensor(actual))
def test_datapoints_image(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(datapoints.Image(input))
assert_equal(datapoints.Image(expected), actual)
def test_datapoints_mask(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(datapoints.Mask(input))
assert_equal(datapoints.Mask(expected), actual)
def test_datapoints_bounding_box(self, p):
input = datapoints.BoundingBox([0, 0, 5, 5], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(input)
expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input
expected = datapoints.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual)
assert actual.format == expected.format
assert actual.spatial_size == expected.spatial_size
class TestPad:
def test_assertions(self):
with pytest.raises(TypeError, match="Got inappropriate padding arg"):
transforms.Pad("abc")
with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
transforms.Pad([-0.7, 0, 0.7])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.Pad(12, fill="abc")
with pytest.raises(ValueError, match="Padding mode should be either"):
transforms.Pad(12, padding_mode="abc")
@pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("padding_mode", ["constant", "edge"])
def test__transform(self, padding, fill, padding_mode, mocker):
transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
inpt = mocker.MagicMock(spec=datapoints.Image)
_ = transform(inpt)
fill = transforms._utils._convert_fill_arg(fill)
if isinstance(padding, tuple):
padding = list(padding)
fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode)
@pytest.mark.parametrize("fill", [12, {datapoints.Image: 12, datapoints.Mask: 34}])
def test__transform_image_mask(self, fill, mocker):
transform = transforms.Pad(1, fill=fill, padding_mode="constant")
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
image = datapoints.Image(torch.rand(3, 32, 32))
mask = datapoints.Mask(torch.randint(0, 5, size=(32, 32)))
inpt = [image, mask]
_ = transform(inpt)
if isinstance(fill, int):
fill = transforms._utils._convert_fill_arg(fill)
calls = [
mocker.call(image, padding=1, fill=fill, padding_mode="constant"),
mocker.call(mask, padding=1, fill=fill, padding_mode="constant"),
]
else:
fill_img = transforms._utils._convert_fill_arg(fill[type(image)])
fill_mask = transforms._utils._convert_fill_arg(fill[type(mask)])
calls = [
mocker.call(image, padding=1, fill=fill_img, padding_mode="constant"),
mocker.call(mask, padding=1, fill=fill_mask, padding_mode="constant"),
]
fn.assert_has_calls(calls)
class TestRandomZoomOut:
def test_assertions(self):
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomZoomOut(fill="abc")
with pytest.raises(TypeError, match="should be a sequence of length"):
transforms.RandomZoomOut(0, side_range=0)
with pytest.raises(ValueError, match="Invalid canvas side range"):
transforms.RandomZoomOut(0, side_range=[4.0, 1.0])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]])
def test__get_params(self, fill, side_range, mocker):
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range)
image = mocker.MagicMock(spec=datapoints.Image)
h, w = image.spatial_size = (24, 32)
params = transform._get_params([image])
assert len(params["padding"]) == 4
assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w
assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h
assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w
assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]])
def test__transform(self, fill, side_range, mocker):
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, fill=fill)
@pytest.mark.parametrize("fill", [12, {datapoints.Image: 12, datapoints.Mask: 34}])
def test__transform_image_mask(self, fill, mocker):
transform = transforms.RandomZoomOut(fill=fill, p=1.0)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
image = datapoints.Image(torch.rand(3, 32, 32))
mask = datapoints.Mask(torch.randint(0, 5, size=(32, 32)))
inpt = [image, mask]
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params(inpt)
if isinstance(fill, int):
fill = transforms._utils._convert_fill_arg(fill)
calls = [
mocker.call(image, **params, fill=fill),
mocker.call(mask, **params, fill=fill),
]
else:
fill_img = transforms._utils._convert_fill_arg(fill[type(image)])
fill_mask = transforms._utils._convert_fill_arg(fill[type(mask)])
calls = [
mocker.call(image, **params, fill=fill_img),
mocker.call(mask, **params, fill=fill_mask),
]
fn.assert_has_calls(calls)
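# Hedged sketch, not part of this change, of the seeding pattern used by the mocker-based tests
# above: re-seeding and consuming one extra torch.rand(1) replays the coin flip of the p-based
# random apply, so _get_params yields the same parameters the transform drew internally
# (assumes this torchvision version's RNG usage).
zoom_out = transforms.RandomZoomOut(fill=0, side_range=(1.0, 4.0), p=1.0)
sample = torch.rand(3, 24, 32)
torch.manual_seed(12)
_ = zoom_out(sample)                       # consumes the random-apply draw, then draws params
torch.manual_seed(12)
torch.rand(1)                              # replay the random-apply draw
replayed_params = zoom_out._get_params([sample])  # matches the params used in the call above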
class TestRandomRotation:
def test_assertions(self):
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomRotation(-0.7)
for d in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="degrees should be a sequence of length 2"):
transforms.RandomRotation(d)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomRotation(12, fill="abc")
with pytest.raises(TypeError, match="center should be a sequence of length"):
transforms.RandomRotation(12, center=12)
with pytest.raises(ValueError, match="center should be a sequence of length"):
transforms.RandomRotation(12, center=[1, 2, 3])
def test__get_params(self):
angle_bound = 34
transform = transforms.RandomRotation(angle_bound)
params = transform._get_params(None)
assert -angle_bound <= params["angle"] <= angle_bound
angle_bounds = [12, 34]
transform = transforms.RandomRotation(angle_bounds)
params = transform._get_params(None)
assert angle_bounds[0] <= params["angle"] <= angle_bounds[1]
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("center", [None, [2.0, 3.0]])
def test__transform(self, degrees, expand, fill, center, mocker):
interpolation = InterpolationMode.BILINEAR
transform = transforms.RandomRotation(
degrees, interpolation=interpolation, expand=expand, fill=fill, center=center
)
if isinstance(degrees, (tuple, list)):
assert transform.degrees == [float(degrees[0]), float(degrees[1])]
else:
assert transform.degrees == [float(-degrees), float(degrees)]
fn = mocker.patch("torchvision.transforms.v2.functional.rotate")
inpt = mocker.MagicMock(spec=datapoints.Image)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params(inpt)
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center)
@pytest.mark.parametrize("angle", [34, -87])
@pytest.mark.parametrize("expand", [False, True])
def test_boundingbox_spatial_size(self, angle, expand):
# Specific test for BoundingBox.rotate
bbox = datapoints.BoundingBox(
torch.tensor([1, 2, 3, 4]), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(32, 32)
)
img = datapoints.Image(torch.rand(1, 3, 32, 32))
out_img = img.rotate(angle, expand=expand)
out_bbox = bbox.rotate(angle, expand=expand)
assert out_img.spatial_size == out_bbox.spatial_size
class TestRandomAffine:
def test_assertions(self):
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomAffine(-0.7)
for d in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="degrees should be a sequence of length 2"):
transforms.RandomAffine(d)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomAffine(12, fill="abc")
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomAffine(12, fill="abc")
for kwargs in [
{"center": 12},
{"translate": 12},
{"scale": 12},
]:
with pytest.raises(TypeError, match="should be a sequence of length"):
transforms.RandomAffine(12, **kwargs)
for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]:
with pytest.raises(ValueError, match="should be a sequence of length"):
transforms.RandomAffine(12, **kwargs)
with pytest.raises(ValueError, match="translation values should be between 0 and 1"):
transforms.RandomAffine(12, translate=[-1.0, 2.0])
with pytest.raises(ValueError, match="scale values should be positive"):
transforms.RandomAffine(12, scale=[-1.0, 2.0])
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomAffine(12, shear=-10)
for s in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="shear should be a sequence of length 2"):
transforms.RandomAffine(12, shear=s)
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("translate", [None, [0.1, 0.2]])
@pytest.mark.parametrize("scale", [None, [0.7, 1.2]])
@pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]])
def test__get_params(self, degrees, translate, scale, shear, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
h, w = image.spatial_size
transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear)
params = transform._get_params([image])
if not isinstance(degrees, (list, tuple)):
assert -degrees <= params["angle"] <= degrees
else:
assert degrees[0] <= params["angle"] <= degrees[1]
if translate is not None:
w_max = int(round(translate[0] * w))
h_max = int(round(translate[1] * h))
assert -w_max <= params["translate"][0] <= w_max
assert -h_max <= params["translate"][1] <= h_max
else:
assert params["translate"] == (0, 0)
if scale is not None:
assert scale[0] <= params["scale"] <= scale[1]
else:
assert params["scale"] == 1.0
if shear is not None:
if isinstance(shear, float):
assert -shear <= params["shear"][0] <= shear
assert params["shear"][1] == 0.0
elif len(shear) == 2:
assert shear[0] <= params["shear"][0] <= shear[1]
assert params["shear"][1] == 0.0
else:
assert shear[0] <= params["shear"][0] <= shear[1]
assert shear[2] <= params["shear"][1] <= shear[3]
else:
assert params["shear"] == (0, 0)
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("translate", [None, [0.1, 0.2]])
@pytest.mark.parametrize("scale", [None, [0.7, 1.2]])
@pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("center", [None, [2.0, 3.0]])
def test__transform(self, degrees, translate, scale, shear, fill, center, mocker):
interpolation = InterpolationMode.BILINEAR
transform = transforms.RandomAffine(
degrees,
translate=translate,
scale=scale,
shear=shear,
interpolation=interpolation,
fill=fill,
center=center,
)
if isinstance(degrees, (tuple, list)):
assert transform.degrees == [float(degrees[0]), float(degrees[1])]
else:
assert transform.degrees == [float(-degrees), float(degrees)]
fn = mocker.patch("torchvision.transforms.v2.functional.affine")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center)
class TestRandomCrop:
def test_assertions(self):
with pytest.raises(ValueError, match="Please provide only two dimensions"):
transforms.RandomCrop([10, 12, 14])
with pytest.raises(TypeError, match="Got inappropriate padding arg"):
transforms.RandomCrop([10, 12], padding="abc")
with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomCrop([10, 12], padding=1, fill="abc")
with pytest.raises(ValueError, match="Padding mode should be either"):
transforms.RandomCrop([10, 12], padding=1, padding_mode="abc")
@pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]])
@pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)])
def test__get_params(self, padding, pad_if_needed, size, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
h, w = image.spatial_size
transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed)
params = transform._get_params([image])
if padding is not None:
if isinstance(padding, int):
pad_top = pad_bottom = pad_left = pad_right = padding
elif isinstance(padding, list) and len(padding) == 2:
pad_left = pad_right = padding[0]
pad_top = pad_bottom = padding[1]
elif isinstance(padding, list) and len(padding) == 4:
pad_left, pad_top, pad_right, pad_bottom = padding
h += pad_top + pad_bottom
w += pad_left + pad_right
else:
pad_left = pad_right = pad_top = pad_bottom = 0
if pad_if_needed:
if w < size[1]:
diff = size[1] - w
pad_left += diff
pad_right += diff
w += 2 * diff
if h < size[0]:
diff = size[0] - h
pad_top += diff
pad_bottom += diff
h += 2 * diff
padding = [pad_left, pad_top, pad_right, pad_bottom]
assert 0 <= params["top"] <= h - size[0] + 1
assert 0 <= params["left"] <= w - size[1] + 1
assert params["height"] == size[0]
assert params["width"] == size[1]
assert params["needs_pad"] is any(padding)
assert params["padding"] == padding
@pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]])
@pytest.mark.parametrize("pad_if_needed", [False, True])
@pytest.mark.parametrize("fill", [False, True])
@pytest.mark.parametrize("padding_mode", ["constant", "edge"])
def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker):
output_size = [10, 12]
transform = transforms.RandomCrop(
output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode
)
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (32, 32)
expected = mocker.MagicMock(spec=datapoints.Image)
expected.num_channels = 3
if isinstance(padding, int):
expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding)
elif isinstance(padding, list):
expected.spatial_size = (
inpt.spatial_size[0] + sum(padding[0::2]),
inpt.spatial_size[1] + sum(padding[1::2]),
)
else:
expected.spatial_size = inpt.spatial_size
_ = mocker.patch("torchvision.transforms.v2.functional.pad", return_value=expected)
fn_crop = mocker.patch("torchvision.transforms.v2.functional.crop")
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
if padding is None and not pad_if_needed:
fn_crop.assert_called_once_with(
inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1]
)
elif not pad_if_needed:
fn_crop.assert_called_once_with(
expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1]
)
elif padding is None:
# vfdev-5: I do not know how to mock and test this case
pass
else:
# vfdev-5: I do not know how to mock and test this case
pass
class TestGaussianBlur:
def test_assertions(self):
with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"):
transforms.GaussianBlur([10, 12, 14])
with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"):
transforms.GaussianBlur(4)
with pytest.raises(
TypeError, match="sigma should be a single int or float or a list/tuple with length 2 floats."
):
transforms.GaussianBlur(3, sigma=[1, 2, 3])
with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"):
transforms.GaussianBlur(3, sigma=-1.0)
with pytest.raises(ValueError, match="sigma values should be positive and of the form"):
transforms.GaussianBlur(3, sigma=[2.0, 1.0])
@pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]])
def test__get_params(self, sigma):
transform = transforms.GaussianBlur(3, sigma=sigma)
params = transform._get_params([])
if isinstance(sigma, float):
assert params["sigma"][0] == params["sigma"][1] == 10
else:
assert sigma[0] <= params["sigma"][0] <= sigma[1]
assert sigma[0] <= params["sigma"][1] <= sigma[1]
@pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)])
@pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]])
def test__transform(self, kernel_size, sigma, mocker):
transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma)
if isinstance(kernel_size, (tuple, list)):
assert transform.kernel_size == kernel_size
else:
kernel_size = (kernel_size, kernel_size)
assert transform.kernel_size == kernel_size
if isinstance(sigma, (tuple, list)):
assert transform.sigma == sigma
else:
assert transform.sigma == [sigma, sigma]
fn = mocker.patch("torchvision.transforms.v2.functional.gaussian_blur")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
fn.assert_called_once_with(inpt, kernel_size, **params)
class TestRandomColorOp:
@pytest.mark.parametrize("p", [0.0, 1.0])
@pytest.mark.parametrize(
"transform_cls, func_op_name, kwargs",
[
(transforms.RandomEqualize, "equalize", {}),
(transforms.RandomInvert, "invert", {}),
(transforms.RandomAutocontrast, "autocontrast", {}),
(transforms.RandomPosterize, "posterize", {"bits": 4}),
(transforms.RandomSolarize, "solarize", {"threshold": 0.5}),
(transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}),
],
)
def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker):
transform = transform_cls(p=p, **kwargs)
fn = mocker.patch(f"torchvision.transforms.v2.functional.{func_op_name}")
inpt = mocker.MagicMock(spec=datapoints.Image)
_ = transform(inpt)
if p > 0.0:
fn.assert_called_once_with(inpt, **kwargs)
else:
assert fn.call_count == 0
class TestRandomPerspective:
def test_assertions(self):
with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"):
transforms.RandomPerspective(distortion_scale=-1.0)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomPerspective(0.5, fill="abc")
def test__get_params(self, mocker):
dscale = 0.5
transform = transforms.RandomPerspective(dscale)
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
params = transform._get_params([image])
h, w = image.spatial_size
assert "coefficients" in params
assert len(params["coefficients"]) == 8
@pytest.mark.parametrize("distortion_scale", [0.1, 0.7])
def test__transform(self, distortion_scale, mocker):
interpolation = InterpolationMode.BILINEAR
fill = 12
transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation)
fn = mocker.patch("torchvision.transforms.v2.functional.perspective")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, None, None, **params, fill=fill, interpolation=interpolation)
class TestElasticTransform:
def test_assertions(self):
with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"):
transforms.ElasticTransform({})
with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"):
transforms.ElasticTransform([1.0, 2.0, 3.0])
with pytest.raises(ValueError, match="alpha should be a sequence of floats"):
transforms.ElasticTransform([1, 2])
with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"):
transforms.ElasticTransform(1.0, {})
with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"):
transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0])
with pytest.raises(ValueError, match="sigma should be a sequence of floats"):
transforms.ElasticTransform(1.0, [1, 2])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.ElasticTransform(1.0, 2.0, fill="abc")
def test__get_params(self, mocker):
alpha = 2.0
sigma = 3.0
transform = transforms.ElasticTransform(alpha, sigma)
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
params = transform._get_params([image])
h, w = image.spatial_size
displacement = params["displacement"]
assert displacement.shape == (1, h, w, 2)
assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all()
assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all()
@pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]])
@pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]])
def test__transform(self, alpha, sigma, mocker):
interpolation = InterpolationMode.BILINEAR
fill = 12
transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation)
if isinstance(alpha, float):
assert transform.alpha == [alpha, alpha]
else:
assert transform.alpha == alpha
if isinstance(sigma, float):
assert transform.sigma == [sigma, sigma]
else:
assert transform.sigma == sigma
fn = mocker.patch("torchvision.transforms.v2.functional.elastic")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# Let's mock transform._get_params to control the output:
transform._get_params = mocker.MagicMock()
_ = transform(inpt)
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation)
class TestRandomErasing:
def test_assertions(self, mocker):
with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"):
transforms.RandomErasing(value={})
with pytest.raises(ValueError, match="If value is str, it should be 'random'"):
transforms.RandomErasing(value="abc")
with pytest.raises(TypeError, match="Scale should be a sequence"):
transforms.RandomErasing(scale=123)
with pytest.raises(TypeError, match="Ratio should be a sequence"):
transforms.RandomErasing(ratio=123)
with pytest.raises(ValueError, match="Scale should be between 0 and 1"):
transforms.RandomErasing(scale=[-1, 2])
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=[1, 2, 3, 4])
with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"):
transform._get_params([image])
@pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"])
def test__get_params(self, value, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=value)
params = transform._get_params([image])
v = params["v"]
h, w = params["h"], params["w"]
i, j = params["i"], params["j"]
assert isinstance(v, torch.Tensor)
if value == "random":
assert v.shape == (image.num_channels, h, w)
elif isinstance(value, (int, float)):
assert v.shape == (1, 1, 1)
elif isinstance(value, (list, tuple)):
assert v.shape == (image.num_channels, 1, 1)
assert 0 <= i <= image.spatial_size[0] - h
assert 0 <= j <= image.spatial_size[1] - w
@pytest.mark.parametrize("p", [0, 1])
def test__transform(self, mocker, p):
transform = transforms.RandomErasing(p=p)
transform._transformed_types = (mocker.MagicMock,)
i_sentinel = mocker.MagicMock()
j_sentinel = mocker.MagicMock()
h_sentinel = mocker.MagicMock()
w_sentinel = mocker.MagicMock()
v_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._augment.RandomErasing._get_params",
return_value=dict(i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._augment.F.erase")
output = transform(inpt_sentinel)
if p:
mock.assert_called_once_with(
inpt_sentinel,
i=i_sentinel,
j=j_sentinel,
h=h_sentinel,
w=w_sentinel,
v=v_sentinel,
inplace=transform.inplace,
)
else:
mock.assert_not_called()
assert output is inpt_sentinel
class TestTransform:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test_check_transformed_types(self, inpt_type, mocker):
# This test ensures that we correctly handle which types to transform and which to bypass
t = transforms.Transform()
inpt = mocker.MagicMock(spec=inpt_type)
if inpt_type in (np.ndarray, str, int):
output = t(inpt)
assert output is inpt
else:
with pytest.raises(NotImplementedError):
t(inpt)
class TestToImageTensor:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch(
"torchvision.transforms.v2.functional.to_image_tensor",
return_value=torch.rand(1, 3, 8, 8),
)
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToImageTensor()
transform(inpt)
if inpt_type in (datapoints.BoundingBox, datapoints.Image, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt)
class TestToImagePIL:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil")
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToImagePIL()
transform(inpt)
if inpt_type in (datapoints.BoundingBox, PIL.Image.Image, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt, mode=transform.mode)
class TestToPILImage:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil")
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToPILImage()
transform(inpt)
if inpt_type in (PIL.Image.Image, datapoints.BoundingBox, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt, mode=transform.mode)
class TestToTensor:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.functional.to_tensor")
inpt = mocker.MagicMock(spec=inpt_type)
with pytest.warns(UserWarning, match="deprecated and will be removed"):
transform = transforms.ToTensor()
transform(inpt)
if inpt_type in (datapoints.Image, torch.Tensor, datapoints.BoundingBox, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt)
class TestContainers:
@pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder])
def test_assertions(self, transform_cls):
with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"):
transform_cls(transforms.RandomCrop(28))
@pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder])
@pytest.mark.parametrize(
"trfms",
[
[transforms.Pad(2), transforms.RandomCrop(28)],
[lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)],
[transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)],
],
)
def test_ctor(self, transform_cls, trfms):
c = transform_cls(trfms)
inpt = torch.rand(1, 3, 32, 32)
output = c(inpt)
assert isinstance(output, torch.Tensor)
assert output.ndim == 4
class TestRandomChoice:
def test_assertions(self):
with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"):
transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2])
with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"):
transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1])
class TestRandomIoUCrop:
@pytest.mark.parametrize("device", cpu_and_gpu())
@pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]])
def test__get_params(self, device, options, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
bboxes = datapoints.BoundingBox(
torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]),
format="XYXY",
spatial_size=image.spatial_size,
device=device,
)
sample = [image, bboxes]
transform = transforms.RandomIoUCrop(sampler_options=options)
n_samples = 5
for _ in range(n_samples):
params = transform._get_params(sample)
if options == [2.0]:
assert len(params) == 0
return
assert len(params["is_within_crop_area"]) > 0
assert params["is_within_crop_area"].dtype == torch.bool
orig_h = image.spatial_size[0]
orig_w = image.spatial_size[1]
assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h)
assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w)
left, top = params["left"], params["top"]
new_h, new_w = params["height"], params["width"]
ious = box_iou(
bboxes,
torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device),
)
assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}"
def test__transform_empty_params(self, mocker):
transform = transforms.RandomIoUCrop(sampler_options=[2.0])
image = datapoints.Image(torch.rand(1, 3, 4, 4))
bboxes = datapoints.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4))
label = proto_datapoints.Label(torch.tensor([1]))
sample = [image, bboxes, label]
# Let's mock transform._get_params to control the output:
transform._get_params = mocker.MagicMock(return_value={})
output = transform(sample)
torch.testing.assert_close(output, sample)
def test_forward_assertion(self):
transform = transforms.RandomIoUCrop()
with pytest.raises(
TypeError,
match="requires input sample to contain tensor or PIL images and bounding boxes",
):
transform(torch.tensor(0))
def test__transform(self, mocker):
transform = transforms.RandomIoUCrop()
image = datapoints.Image(torch.rand(3, 32, 24))
bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,))
masks = make_detection_mask((32, 24), num_objects=6)
sample = [image, bboxes, masks]
fn = mocker.patch("torchvision.transforms.v2.functional.crop", side_effect=lambda x, **params: x)
is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool)
params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area)
transform._get_params = mocker.MagicMock(return_value=params)
output = transform(sample)
assert fn.call_count == 3
expected_calls = [
mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
]
fn.assert_has_calls(expected_calls)
# check number of bboxes vs number of labels:
output_bboxes = output[1]
assert isinstance(output_bboxes, datapoints.BoundingBox)
assert (output_bboxes[~is_within_crop_area] == 0).all()
output_masks = output[2]
assert isinstance(output_masks, datapoints.Mask)
class TestScaleJitter:
def test__get_params(self, mocker):
spatial_size = (24, 32)
target_size = (16, 12)
scale_range = (0.5, 1.5)
transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range)
sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
n_samples = 5
for _ in range(n_samples):
params = transform._get_params([sample])
assert "size" in params
size = params["size"]
assert isinstance(size, tuple) and len(size) == 2
height, width = size
r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0]
r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1]
assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max)
assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max)
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.ScaleJitter(
target_size=(16, 12), interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.ScaleJitter._get_params", return_value=dict(size=size_sentinel)
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock.assert_called_once_with(
inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
class TestRandomShortestSize:
@pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)])
def test__get_params(self, min_size, max_size, mocker):
spatial_size = (3, 10)
transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)
sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
params = transform._get_params([sample])
assert "size" in params
size = params["size"]
assert isinstance(size, tuple) and len(size) == 2
longer = max(size)
shorter = min(size)
if max_size is not None:
assert longer <= max_size
assert shorter <= max_size
else:
assert shorter in min_size
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.RandomShortestSize(
min_size=[3, 5, 7], max_size=12, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.RandomShortestSize._get_params",
return_value=dict(size=size_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock.assert_called_once_with(
inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
def test_mixup_cutmix(transform, input):
transform(input)
input_copy = dict(input)
input_copy["path"] = "/path/to/somewhere"
input_copy["num"] = 1234
transform(input_copy)
# Check that an error is raised if the sample contains a bbox, mask, or label
err_msg = "does not support PIL images, bounding boxes, masks and plain labels"
input_copy = dict(input)
for unsup_data in [
make_label(),
make_bounding_box(format="XYXY"),
make_detection_mask(),
make_segmentation_mask(),
]:
input_copy["unsupported"] = unsup_data
with pytest.raises(TypeError, match=err_msg):
transform(input_copy)
class TestSimpleCopyPaste:
......@@ -1617,27 +94,27 @@ class TestSimpleCopyPaste:
return mocker.MagicMock(spec=image_type)
def test__extract_image_targets_assertion(self, mocker):
transform = proto_transforms.SimpleCopyPaste()
transform = transforms.SimpleCopyPaste()
flat_sample = [
# images, batch size = 2
self.create_fake_image(mocker, datapoints.Image),
self.create_fake_image(mocker, Image),
# labels, bboxes, masks
mocker.MagicMock(spec=proto_datapoints.Label),
mocker.MagicMock(spec=datapoints.BoundingBox),
mocker.MagicMock(spec=datapoints.Mask),
mocker.MagicMock(spec=datapoints.Label),
mocker.MagicMock(spec=BoundingBox),
mocker.MagicMock(spec=Mask),
# labels, bboxes, masks
mocker.MagicMock(spec=datapoints.BoundingBox),
mocker.MagicMock(spec=datapoints.Mask),
mocker.MagicMock(spec=BoundingBox),
mocker.MagicMock(spec=Mask),
]
with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"):
transform._extract_image_targets(flat_sample)
@pytest.mark.parametrize("image_type", [datapoints.Image, PIL.Image.Image, torch.Tensor])
@pytest.mark.parametrize("label_type", [proto_datapoints.Label, proto_datapoints.OneHotLabel])
@pytest.mark.parametrize("image_type", [Image, PIL.Image.Image, torch.Tensor])
@pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel])
def test__extract_image_targets(self, image_type, label_type, mocker):
transform = proto_transforms.SimpleCopyPaste()
transform = transforms.SimpleCopyPaste()
flat_sample = [
# images, batch size = 2
......@@ -1645,12 +122,12 @@ class TestSimpleCopyPaste:
self.create_fake_image(mocker, image_type),
# labels, bboxes, masks
mocker.MagicMock(spec=label_type),
mocker.MagicMock(spec=datapoints.BoundingBox),
mocker.MagicMock(spec=datapoints.Mask),
mocker.MagicMock(spec=BoundingBox),
mocker.MagicMock(spec=Mask),
# labels, bboxes, masks
mocker.MagicMock(spec=label_type),
mocker.MagicMock(spec=datapoints.BoundingBox),
mocker.MagicMock(spec=datapoints.Mask),
mocker.MagicMock(spec=BoundingBox),
mocker.MagicMock(spec=Mask),
]
images, targets = transform._extract_image_targets(flat_sample)
......@@ -1665,15 +142,15 @@ class TestSimpleCopyPaste:
for target in targets:
for key, type_ in [
("boxes", datapoints.BoundingBox),
("masks", datapoints.Mask),
("boxes", BoundingBox),
("masks", Mask),
("labels", label_type),
]:
assert key in target
assert isinstance(target[key], type_)
assert target[key] in flat_sample
@pytest.mark.parametrize("label_type", [proto_datapoints.Label, proto_datapoints.OneHotLabel])
@pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel])
def test__copy_paste(self, label_type):
image = 2 * torch.ones(3, 32, 32)
masks = torch.zeros(2, 32, 32)
......@@ -1683,13 +160,13 @@ class TestSimpleCopyPaste:
blending = True
resize_interpolation = InterpolationMode.BILINEAR
antialias = None
if label_type == proto_datapoints.OneHotLabel:
if label_type == datapoints.OneHotLabel:
labels = torch.nn.functional.one_hot(labels, num_classes=5)
target = {
"boxes": datapoints.BoundingBox(
"boxes": BoundingBox(
torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32)
),
"masks": datapoints.Mask(masks),
"masks": Mask(masks),
"labels": label_type(labels),
}
......@@ -1698,17 +175,17 @@ class TestSimpleCopyPaste:
paste_masks[0, 13:19, 12:18] = 1
paste_masks[1, 15:19, 1:8] = 1
paste_labels = torch.tensor([3, 4])
if label_type == proto_datapoints.OneHotLabel:
if label_type == datapoints.OneHotLabel:
paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5)
paste_target = {
"boxes": datapoints.BoundingBox(
"boxes": BoundingBox(
torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32)
),
"masks": datapoints.Mask(paste_masks),
"masks": Mask(paste_masks),
"labels": label_type(paste_labels),
}
transform = proto_transforms.SimpleCopyPaste()
transform = transforms.SimpleCopyPaste()
random_selection = torch.tensor([0, 1])
output_image, output_target = transform._copy_paste(
image, target, paste_image, paste_target, random_selection, blending, resize_interpolation, antialias
......@@ -1720,7 +197,7 @@ class TestSimpleCopyPaste:
torch.testing.assert_close(output_target["boxes"][2:, :], paste_target["boxes"])
expected_labels = torch.tensor([1, 2, 3, 4])
if label_type == proto_datapoints.OneHotLabel:
if label_type == datapoints.OneHotLabel:
expected_labels = torch.nn.functional.one_hot(expected_labels, num_classes=5)
torch.testing.assert_close(output_target["labels"], label_type(expected_labels))
......@@ -1735,13 +212,11 @@ class TestFixedSizeCrop:
batch_shape = (10,)
spatial_size = (11, 5)
transform = proto_transforms.FixedSizeCrop(size=crop_size)
transform = transforms.FixedSizeCrop(size=crop_size)
flat_inputs = [
make_image(size=spatial_size, color_space="RGB"),
make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape
),
make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape),
]
params = transform._get_params(flat_inputs)
......@@ -1763,7 +238,7 @@ class TestFixedSizeCrop:
fill_sentinel = 12
padding_mode_sentinel = mocker.MagicMock()
transform = proto_transforms.FixedSizeCrop((-1, -1), fill=fill_sentinel, padding_mode=padding_mode_sentinel)
transform = transforms.FixedSizeCrop((-1, -1), fill=fill_sentinel, padding_mode=padding_mode_sentinel)
transform._transformed_types = (mocker.MagicMock,)
mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True)
......@@ -1837,12 +312,12 @@ class TestFixedSizeCrop:
)
bounding_boxes = make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
)
masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,))
labels = make_label(extra_dims=(batch_size,))
transform = proto_transforms.FixedSizeCrop((-1, -1))
transform = transforms.FixedSizeCrop((-1, -1))
mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True)
output = transform(
......@@ -1875,11 +350,11 @@ class TestFixedSizeCrop:
)
bounding_box = make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
)
mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box")
transform = proto_transforms.FixedSizeCrop((-1, -1))
transform = transforms.FixedSizeCrop((-1, -1))
mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True)
transform(bounding_box)
......@@ -1887,178 +362,48 @@ class TestFixedSizeCrop:
mock.assert_called_once()
class TestLinearTransformation:
def test_assertions(self):
with pytest.raises(ValueError, match="transformation_matrix should be square"):
transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5))
with pytest.raises(ValueError, match="mean_vector should have the same length"):
transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5))
@pytest.mark.parametrize(
"inpt",
[
122 * torch.ones(1, 3, 8, 8),
122.0 * torch.ones(1, 3, 8, 8),
datapoints.Image(122 * torch.ones(1, 3, 8, 8)),
PIL.Image.new("RGB", (8, 8), (122, 122, 122)),
],
)
def test__transform(self, inpt):
v = 121 * torch.ones(3 * 8 * 8)
m = torch.ones(3 * 8 * 8, 3 * 8 * 8)
transform = transforms.LinearTransformation(m, v)
if isinstance(inpt, PIL.Image.Image):
with pytest.raises(TypeError, match="LinearTransformation does not work on PIL Images"):
transform(inpt)
else:
output = transform(inpt)
assert isinstance(output, torch.Tensor)
assert output.unique() == 3 * 8 * 8
assert output.dtype == inpt.dtype
class TestLabelToOneHot:
def test__transform(self):
categories = ["apple", "pear", "pineapple"]
labels = proto_datapoints.Label(torch.tensor([0, 1, 2, 1]), categories=categories)
transform = proto_transforms.LabelToOneHot()
labels = datapoints.Label(torch.tensor([0, 1, 2, 1]), categories=categories)
transform = transforms.LabelToOneHot()
ohe_labels = transform(labels)
assert isinstance(ohe_labels, proto_datapoints.OneHotLabel)
assert isinstance(ohe_labels, datapoints.OneHotLabel)
assert ohe_labels.shape == (4, 3)
assert ohe_labels.categories == labels.categories == categories
class TestRandomResize:
def test__get_params(self):
min_size = 3
max_size = 6
transform = transforms.RandomResize(min_size=min_size, max_size=max_size)
for _ in range(10):
params = transform._get_params([])
assert isinstance(params["size"], list) and len(params["size"]) == 1
size = params["size"][0]
assert min_size <= size < max_size
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.RandomResize(
min_size=-1, max_size=-1, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.RandomResize._get_params",
return_value=dict(size=size_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock_resize = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock_resize.assert_called_with(
inpt_sentinel, size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
class TestToDtype:
@pytest.mark.parametrize(
("dtype", "expected_dtypes"),
[
(
torch.float64,
{
datapoints.Video: torch.float64,
datapoints.Image: torch.float64,
datapoints.BoundingBox: torch.float64,
},
),
(
{datapoints.Video: torch.int32, datapoints.Image: torch.float32, datapoints.BoundingBox: torch.float64},
{datapoints.Video: torch.int32, datapoints.Image: torch.float32, datapoints.BoundingBox: torch.float64},
),
],
)
def test_call(self, dtype, expected_dtypes):
sample = dict(
video=make_video(dtype=torch.int64),
image=make_image(dtype=torch.uint8),
bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32),
str="str",
int=0,
)
transform = transforms.ToDtype(dtype)
transformed_sample = transform(sample)
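# tensors must be converted to the requested dtype, while non-tensor values (str, int) are passed through untouched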
for key, value in sample.items():
value_type = type(value)
transformed_value = transformed_sample[key]
# make sure the transformation retains the type
assert isinstance(transformed_value, value_type)
if isinstance(value, torch.Tensor):
assert transformed_value.dtype is expected_dtypes[value_type]
else:
assert transformed_value is value
@pytest.mark.filterwarnings("error")
def test_plain_tensor_call(self):
tensor = torch.empty((), dtype=torch.float32)
transform = transforms.ToDtype({torch.Tensor: torch.float64})
assert transform(tensor).dtype is torch.float64
@pytest.mark.parametrize("other_type", [datapoints.Image, datapoints.Video])
def test_plain_tensor_warning(self, other_type):
with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")):
transforms.ToDtype(dtype={torch.Tensor: torch.float32, other_type: torch.float64})
class TestPermuteDimensions:
@pytest.mark.parametrize(
("dims", "inverse_dims"),
[
(
{datapoints.Image: (2, 1, 0), datapoints.Video: None},
{datapoints.Image: (2, 1, 0), datapoints.Video: None},
{Image: (2, 1, 0), Video: None},
{Image: (2, 1, 0), Video: None},
),
(
{datapoints.Image: (2, 1, 0), datapoints.Video: (1, 2, 3, 0)},
{datapoints.Image: (2, 1, 0), datapoints.Video: (3, 0, 1, 2)},
{Image: (2, 1, 0), Video: (1, 2, 3, 0)},
{Image: (2, 1, 0), Video: (3, 0, 1, 2)},
),
],
)
def test_call(self, dims, inverse_dims):
sample = dict(
image=make_image(),
bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY),
video=make_video(),
str="str",
int=0,
)
transform = proto_transforms.PermuteDimensions(dims)
transform = transforms.PermuteDimensions(dims)
transformed_sample = transform(sample)
for key, value in sample.items():
value_type = type(value)
transformed_value = transformed_sample[key]
if check_type(
value, (datapoints.Image, torchvision.transforms.v2.utils.is_simple_tensor, datapoints.Video)
):
if check_type(value, (Image, is_simple_tensor, Video)):
if transform.dims.get(value_type) is not None:
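# applying the inverse permutation should recover the original value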
assert transformed_value.permute(inverse_dims[value_type]).equal(value)
assert type(transformed_value) == torch.Tensor
......@@ -2068,14 +413,14 @@ class TestPermuteDimensions:
@pytest.mark.filterwarnings("error")
def test_plain_tensor_call(self):
tensor = torch.empty((2, 3, 4))
transform = proto_transforms.PermuteDimensions(dims=(1, 2, 0))
transform = transforms.PermuteDimensions(dims=(1, 2, 0))
assert transform(tensor).shape == (3, 4, 2)
@pytest.mark.parametrize("other_type", [datapoints.Image, datapoints.Video])
@pytest.mark.parametrize("other_type", [Image, Video])
def test_plain_tensor_warning(self, other_type):
with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")):
proto_transforms.PermuteDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)})
transforms.PermuteDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)})
class TestTransposeDimensions:
......@@ -2083,19 +428,19 @@ class TestTransposeDimensions:
"dims",
[
(-1, -2),
{datapoints.Image: (1, 2), datapoints.Video: None},
{Image: (1, 2), Video: None},
],
)
def test_call(self, dims):
sample = dict(
image=make_image(),
bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY),
video=make_video(),
str="str",
int=0,
)
transform = proto_transforms.TransposeDimensions(dims)
transform = transforms.TransposeDimensions(dims)
transformed_sample = transform(sample)
for key, value in sample.items():
......@@ -2103,9 +448,7 @@ class TestTransposeDimensions:
transformed_value = transformed_sample[key]
transposed_dims = transform.dims.get(value_type)
if check_type(
value, (datapoints.Image, torchvision.transforms.v2.utils.is_simple_tensor, datapoints.Video)
):
if check_type(value, (Image, is_simple_tensor, Video)):
if transposed_dims is not None:
assert transformed_value.transpose(*transposed_dims).equal(value)
assert type(transformed_value) == torch.Tensor
......@@ -2115,372 +458,78 @@ class TestTransposeDimensions:
@pytest.mark.filterwarnings("error")
def test_plain_tensor_call(self):
tensor = torch.empty((2, 3, 4))
transform = proto_transforms.TransposeDimensions(dims=(0, 2))
transform = transforms.TransposeDimensions(dims=(0, 2))
assert transform(tensor).shape == (4, 3, 2)
@pytest.mark.parametrize("other_type", [datapoints.Image, datapoints.Video])
@pytest.mark.parametrize("other_type", [Image, Video])
def test_plain_tensor_warning(self, other_type):
with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")):
proto_transforms.TransposeDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)})
class TestUniformTemporalSubsample:
@pytest.mark.parametrize(
"inpt",
[
torch.zeros(10, 3, 8, 8),
torch.zeros(1, 10, 3, 8, 8),
datapoints.Video(torch.zeros(1, 10, 3, 8, 8)),
],
)
def test__transform(self, inpt):
num_samples = 5
transform = transforms.UniformTemporalSubsample(num_samples)
output = transform(inpt)
assert type(output) is type(inpt)
assert output.shape[-4] == num_samples
assert output.dtype == inpt.dtype
# TODO: remove this test in 0.17 when the default of antialias changes to True
def test_antialias_warning():
pil_img = PIL.Image.new("RGB", size=(10, 10), color=127)
tensor_img = torch.randint(0, 256, size=(3, 10, 10), dtype=torch.uint8)
tensor_video = torch.randint(0, 256, size=(2, 3, 10, 10), dtype=torch.uint8)
match = "The default value of the antialias parameter"
with pytest.warns(UserWarning, match=match):
transforms.Resize((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomResizedCrop((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.ScaleJitter((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomShortestSize((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomResize(10, 20)(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.functional.resize(tensor_img, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize_image_tensor(tensor_img, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize(tensor_video, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize_video(tensor_video, (20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Image(tensor_img).resize((20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Image(tensor_img).resized_crop(0, 0, 10, 10, (20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Video(tensor_video).resize((20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Video(tensor_video).resized_crop(0, 0, 10, 10, (20, 20))
with warnings.catch_warnings():
warnings.simplefilter("error")
transforms.Resize((20, 20))(pil_img)
transforms.RandomResizedCrop((20, 20))(pil_img)
transforms.ScaleJitter((20, 20))(pil_img)
transforms.RandomShortestSize((20, 20))(pil_img)
transforms.RandomResize(10, 20)(pil_img)
transforms.functional.resize(pil_img, (20, 20))
transforms.Resize((20, 20), antialias=True)(tensor_img)
transforms.RandomResizedCrop((20, 20), antialias=True)(tensor_img)
transforms.ScaleJitter((20, 20), antialias=True)(tensor_img)
transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img)
transforms.RandomResize(10, 20, antialias=True)(tensor_img)
transforms.functional.resize(tensor_img, (20, 20), antialias=True)
transforms.functional.resize_image_tensor(tensor_img, (20, 20), antialias=True)
transforms.functional.resize(tensor_video, (20, 20), antialias=True)
transforms.functional.resize_video(tensor_video, (20, 20), antialias=True)
datapoints.Image(tensor_img).resize((20, 20), antialias=True)
datapoints.Image(tensor_img).resized_crop(0, 0, 10, 10, (20, 20), antialias=True)
datapoints.Video(tensor_video).resize((20, 20), antialias=True)
datapoints.Video(tensor_video).resized_crop(0, 0, 10, 10, (20, 20), antialias=True)
@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image))
@pytest.mark.parametrize("label_type", (torch.Tensor, int))
@pytest.mark.parametrize("dataset_return_type", (dict, tuple))
@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImageTensor))
def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor):
image = datapoints.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8))
if image_type is PIL.Image:
image = to_pil_image(image[0])
elif image_type is torch.Tensor:
image = image.as_subclass(torch.Tensor)
assert is_simple_tensor(image)
label = 1 if label_type is int else torch.tensor([1])
if dataset_return_type is dict:
sample = {
"image": image,
"label": label,
}
else:
sample = image, label
transforms.TransposeDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)})
t = transforms.Compose(
[
transforms.RandomResizedCrop((224, 224)),
transforms.RandomHorizontalFlip(p=1),
transforms.RandAugment(),
transforms.TrivialAugmentWide(),
transforms.AugMix(),
transforms.AutoAugment(),
to_tensor(),
# TODO: ConvertImageDtype is a pass-through on PIL images, is that
# intended? This results in a failure if we convert to tensor after
# it, because the image would still be uint8, which makes Normalize
# fail.
transforms.ConvertImageDtype(torch.float),
transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]),
transforms.RandomErasing(p=1),
]
)
out = t(sample)
import importlib.machinery
import importlib.util
from pathlib import Path
assert type(out) == type(sample)
if dataset_return_type is tuple:
out_image, out_label = out
else:
assert out.keys() == sample.keys()
out_image, out_label = out.values()
def import_transforms_from_references(reference):
HERE = Path(__file__).parent
PROJECT_ROOT = HERE.parent
assert out_image.shape[-2:] == (224, 224)
assert out_label == label
@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image))
@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite"))
@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImageTensor))
@pytest.mark.parametrize("sanitize", (True, False))
def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
torch.manual_seed(0)
if data_augmentation == "hflip":
t = [
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "lsj":
t = [
transforms.ScaleJitter(target_size=(1024, 1024), antialias=True),
# Note: replaced FixedSizeCrop with RandomCrop, because we're
# leaving FixedSizeCrop in prototype for now, and it expects Label
# classes which we won't release yet.
# transforms.FixedSizeCrop(
# size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})
# ),
transforms.RandomCrop((1024, 1024), pad_if_needed=True),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "multiscale":
t = [
transforms.RandomShortestSize(
min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True
),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "ssd":
t = [
transforms.RandomPhotometricDistort(p=1),
transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})),
transforms.RandomIoUCrop(),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "ssdlite":
t = [
transforms.RandomIoUCrop(),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
if sanitize:
t += [transforms.SanitizeBoundingBoxes()]
t = transforms.Compose(t)
num_boxes = 5
H = W = 250
image = datapoints.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8))
if image_type is PIL.Image:
image = to_pil_image(image[0])
elif image_type is torch.Tensor:
image = image.as_subclass(torch.Tensor)
assert is_simple_tensor(image)
label = torch.randint(0, 10, size=(num_boxes,))
boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4))
boxes[:, 2:] += boxes[:, :2]
boxes = boxes.clamp(min=0, max=min(H, W))
boxes = datapoints.BoundingBox(boxes, format="XYXY", spatial_size=(H, W))
masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8))
sample = {
"image": image,
"label": label,
"boxes": boxes,
"masks": masks,
}
out = t(sample)
if to_tensor is transforms.ToTensor and image_type is not datapoints.Image:
assert is_simple_tensor(out["image"])
else:
assert isinstance(out["image"], datapoints.Image)
assert isinstance(out["label"], type(sample["label"]))
num_boxes_expected = {
# ssd and ssdlite contain RandomIoUCrop which may "remove" some bboxes. Strictly
# speaking it doesn't remove them, it just marks some boxes as degenerate, and
# those boxes are later removed by SanitizeBoundingBoxes(), which we add to the
# pipelines if the sanitize param is True.
# Note that the values below are probably specific to the random seed
# set above (which is fine).
(True, "ssd"): 4,
(True, "ssdlite"): 4,
}.get((sanitize, data_augmentation), num_boxes)
assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected
@pytest.mark.parametrize("min_size", (1, 10))
@pytest.mark.parametrize(
"labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None)
)
def test_sanitize_bounding_boxes(min_size, labels_getter):
H, W = 256, 128
boxes_and_validity = [
([0, 1, 10, 1], False), # Y1 == Y2
([0, 1, 0, 20], False), # X1 == X2
([0, 0, min_size - 1, 10], False), # W < min_size
([0, 0, 10, min_size - 1], False), # H < min_size
([0, 0, 10, H + 1], False), # Y2 > H
([0, 0, W + 1, 10], False), # X2 > W
([-1, 1, 10, 20], False), # any < 0
([0, 0, -1, 20], False), # any < 0
([0, 0, -10, -1], False), # any < 0
([0, 0, min_size, 10], True), # W == min_size
([0, 0, 10, min_size], True), # H == min_size
([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1?
([1, 1, 30, 20], True),
([0, 0, 10, 10], True),
([1, 1, 30, 20], True),
]
random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases
boxes, is_valid_mask = zip(*boxes_and_validity)
valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid]
boxes = torch.tensor(boxes)
labels = torch.arange(boxes.shape[0])
boxes = datapoints.BoundingBox(
boxes,
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(H, W),
loader = importlib.machinery.SourceFileLoader(
"transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py")
)
spec = importlib.util.spec_from_loader("transforms", loader)
module = importlib.util.module_from_spec(spec)
loader.exec_module(module)
return module
masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W)))
sample = {
"image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8),
"labels": labels,
"boxes": boxes,
"whatever": torch.rand(10),
"None": None,
"masks": masks,
}
out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample)
det_transforms = import_transforms_from_references("detection")
assert out["image"] is sample["image"]
assert out["whatever"] is sample["whatever"]
if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None):
assert out["labels"] is sample["labels"]
else:
assert isinstance(out["labels"], torch.Tensor)
assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0]
# This works because we conveniently set labels to arange(num_boxes)
assert out["labels"].tolist() == valid_indices
def test_fixed_sized_crop_against_detection_reference():
def make_datapoints():
size = (600, 800)
num_objects = 22
pil_image = to_image_pil(make_image(size=size, color_space="RGB"))
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
}
@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT"))
def test_sanitize_bounding_boxes_default_heuristic(key):
labels = torch.arange(10)
d = {key: labels}
assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels
if key.lower() != "labels":
# If "labels" is in the dict (case-insensitive),
# it takes precedence over other keys which would otherwise be a match
d = {key: "something_else", "labels": labels}
assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels
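# A minimal sketch, using a hypothetical helper, of a heuristic consistent with the assertions above
# (not the library's actual implementation): an exact, case-insensitive "labels" key takes precedence,
# otherwise any key that merely contains "label" is accepted.
def _example_find_labels(d):
    for key, value in d.items():
        if key.lower() == "labels":
            return value
    for key, value in d.items():
        if "label" in key.lower():
            return value
    return None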
yield (pil_image, target)
def test_sanitize_bounding_boxes_errors():
tensor_image = torch.Tensor(make_image(size=size, color_space="RGB"))
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
}
good_bbox = datapoints.BoundingBox(
[[0, 0, 10, 10]],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(20, 20),
)
yield (tensor_image, target)
with pytest.raises(ValueError, match="min_size must be >= 1"):
transforms.SanitizeBoundingBoxes(min_size=0)
with pytest.raises(ValueError, match="labels_getter should either be a str"):
transforms.SanitizeBoundingBoxes(labels_getter=12)
datapoint_image = make_image(size=size, color_space="RGB")
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
}
with pytest.raises(ValueError, match="Could not infer where the labels are"):
bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])}
transforms.SanitizeBoundingBoxes()(bad_labels_key)
yield (datapoint_image, target)
with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"):
not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0]))
transforms.SanitizeBoundingBoxes()(not_a_dict)
t = transforms.FixedSizeCrop((1024, 1024), fill=0)
t_ref = det_transforms.FixedSizeCrop((1024, 1024), fill=0)
with pytest.raises(ValueError, match="must be a tensor"):
not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()}
transforms.SanitizeBoundingBoxes()(not_a_tensor)
for dp in make_datapoints():
# We should run the prototype transform first, as the reference transform performs an in-place target update
torch.manual_seed(12)
output = t(dp)
with pytest.raises(ValueError, match="Number of boxes"):
different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)}
transforms.SanitizeBoundingBoxes()(different_sizes)
torch.manual_seed(12)
expected_output = t_ref(*dp)
with pytest.raises(ValueError, match="boxes must be of shape"):
bad_bbox = datapoints.BoundingBox( # batch with 2 elements
[
[[0, 0, 10, 10]],
[[0, 0, 10, 10]],
],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(20, 20),
)
different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])}
transforms.SanitizeBoundingBoxes()(different_sizes)
assert_equal(expected_output, output)
import itertools
import pathlib
import random
import re
import warnings
from collections import defaultdict
import numpy as np
import PIL.Image
import pytest
import torch
import torchvision.transforms.v2 as transforms
from common_utils import (
assert_equal,
cpu_and_gpu,
make_bounding_box,
make_bounding_boxes,
make_detection_mask,
make_image,
make_images,
make_segmentation_mask,
make_video,
make_videos,
)
from torch.utils._pytree import tree_flatten, tree_unflatten
from torchvision import datapoints
from torchvision.ops.boxes import box_iou
from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image
from torchvision.transforms.v2 import functional as F
from torchvision.transforms.v2.utils import check_type, is_simple_tensor, query_chw
def make_vanilla_tensor_images(*args, **kwargs):
for image in make_images(*args, **kwargs):
if image.ndim > 3:
continue
yield image.data
def make_pil_images(*args, **kwargs):
for image in make_vanilla_tensor_images(*args, **kwargs):
yield to_pil_image(image)
def make_vanilla_tensor_bounding_boxes(*args, **kwargs):
for bounding_box in make_bounding_boxes(*args, **kwargs):
yield bounding_box.data
def parametrize(transforms_with_inputs):
return pytest.mark.parametrize(
("transform", "input"),
[
pytest.param(
transform,
input,
id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}",
)
for transform, inputs in transforms_with_inputs
for idx, input in enumerate(inputs)
],
)
def auto_augment_adapter(transform, input, device):
adapted_input = {}
image_or_video_found = False
for key, value in input.items():
if isinstance(value, (datapoints.BoundingBox, datapoints.Mask)):
# AA transforms don't support bounding boxes or masks
continue
elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor, PIL.Image.Image)):
if image_or_video_found:
# AA transforms only support a single image or video
continue
image_or_video_found = True
adapted_input[key] = value
return adapted_input
def linear_transformation_adapter(transform, input, device):
flat_inputs = list(input.values())
c, h, w = query_chw(
[
item
for item, needs_transform in zip(flat_inputs, transforms.Transform()._needs_transform_list(flat_inputs))
if needs_transform
]
)
num_elements = c * h * w
transform.transformation_matrix = torch.randn((num_elements, num_elements), device=device)
transform.mean_vector = torch.randn((num_elements,), device=device)
return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)}
def normalize_adapter(transform, input, device):
adapted_input = {}
for key, value in input.items():
if isinstance(value, PIL.Image.Image):
# normalize doesn't support PIL images
continue
elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor)):
# normalize doesn't support integer images
value = F.convert_dtype(value, torch.float32)
adapted_input[key] = value
return adapted_input
class TestSmoke:
@pytest.mark.parametrize(
("transform", "adapter"),
[
(transforms.RandomErasing(p=1.0), None),
(transforms.AugMix(), auto_augment_adapter),
(transforms.AutoAugment(), auto_augment_adapter),
(transforms.RandAugment(), auto_augment_adapter),
(transforms.TrivialAugmentWide(), auto_augment_adapter),
(transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None),
(transforms.Grayscale(), None),
(transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None),
(transforms.RandomAutocontrast(p=1.0), None),
(transforms.RandomEqualize(p=1.0), None),
(transforms.RandomGrayscale(p=1.0), None),
(transforms.RandomInvert(p=1.0), None),
(transforms.RandomPhotometricDistort(p=1.0), None),
(transforms.RandomPosterize(bits=4, p=1.0), None),
(transforms.RandomSolarize(threshold=0.5, p=1.0), None),
(transforms.CenterCrop([16, 16]), None),
(transforms.ElasticTransform(sigma=1.0), None),
(transforms.Pad(4), None),
(transforms.RandomAffine(degrees=30.0), None),
(transforms.RandomCrop([16, 16], pad_if_needed=True), None),
(transforms.RandomHorizontalFlip(p=1.0), None),
(transforms.RandomPerspective(p=1.0), None),
(transforms.RandomResize(min_size=10, max_size=20), None),
(transforms.RandomResizedCrop([16, 16]), None),
(transforms.RandomRotation(degrees=30), None),
(transforms.RandomShortestSize(min_size=10), None),
(transforms.RandomVerticalFlip(p=1.0), None),
(transforms.RandomZoomOut(p=1.0), None),
(transforms.Resize([16, 16], antialias=True), None),
(transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
(transforms.ClampBoundingBox(), None),
(transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
(transforms.ConvertDtype(), None),
(transforms.GaussianBlur(kernel_size=3), None),
(
transforms.LinearTransformation(
# These are just dummy values that will be filled by the adapter. We can't define them upfront,
# because at this point we know neither the spatial size nor the device
transformation_matrix=torch.empty((1, 1)),
mean_vector=torch.empty((1,)),
),
linear_transformation_adapter,
),
(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter),
(transforms.ToDtype(torch.float64), None),
(transforms.UniformTemporalSubsample(num_samples=2), None),
],
ids=lambda transform: type(transform).__name__,
)
@pytest.mark.parametrize("container_type", [dict, list, tuple])
@pytest.mark.parametrize(
"image_or_video",
[
make_image(),
make_video(),
next(make_pil_images(color_spaces=["RGB"])),
next(make_vanilla_tensor_images()),
],
)
@pytest.mark.parametrize("device", cpu_and_gpu())
def test_common(self, transform, adapter, container_type, image_or_video, device):
spatial_size = F.get_spatial_size(image_or_video)
input = dict(
image_or_video=image_or_video,
image_datapoint=make_image(size=spatial_size),
video_datapoint=make_video(size=spatial_size),
image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])),
bounding_box_xyxy=make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(3,)
),
bounding_box_xywh=make_bounding_box(
format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, extra_dims=(4,)
),
bounding_box_cxcywh=make_bounding_box(
format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, extra_dims=(5,)
),
bounding_box_degenerate_xyxy=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[2, 0, 1, 1], # x1 > x2, y1 < y2
[0, 2, 1, 1], # x1 < x2, y1 > y2
[2, 2, 1, 1], # x1 > x2, y1 > y2
],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=spatial_size,
),
bounding_box_degenerate_xywh=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[0, 0, 1, -1], # negative height
[0, 0, -1, 1], # negative width
[0, 0, -1, -1], # negative height and width
],
format=datapoints.BoundingBoxFormat.XYWH,
spatial_size=spatial_size,
),
bounding_box_degenerate_cxcywh=datapoints.BoundingBox(
[
[0, 0, 0, 0], # no height or width
[0, 0, 0, 1], # no height
[0, 0, 1, 0], # no width
[0, 0, 1, -1], # negative height
[0, 0, -1, 1], # negative width
[0, 0, -1, -1], # negative height and width
],
format=datapoints.BoundingBoxFormat.CXCYWH,
spatial_size=spatial_size,
),
detection_mask=make_detection_mask(size=spatial_size),
segmentation_mask=make_segmentation_mask(size=spatial_size),
int=0,
float=0.0,
bool=True,
none=None,
str="str",
path=pathlib.Path.cwd(),
object=object(),
tensor=torch.empty(5),
array=np.empty(5),
)
if adapter is not None:
input = adapter(transform, input, device)
if container_type in {tuple, list}:
input = container_type(input.values())
input_flat, input_spec = tree_flatten(input)
input_flat = [item.to(device) if isinstance(item, torch.Tensor) else item for item in input_flat]
input = tree_unflatten(input_flat, input_spec)
torch.manual_seed(0)
output = transform(input)
output_flat, output_spec = tree_flatten(output)
assert output_spec == input_spec
for output_item, input_item, should_be_transformed in zip(
output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat)
):
if should_be_transformed:
assert type(output_item) is type(input_item)
else:
assert output_item is input_item
if isinstance(input_item, datapoints.BoundingBox) and not isinstance(
transform, transforms.ConvertBoundingBoxFormat
):
assert output_item.format == input_item.format
# Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future
# transform that does this) back into a valid one.
# TODO: we should test that against all degenerate boxes above
for format in list(datapoints.BoundingBoxFormat):
sample = dict(
boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)),
labels=torch.tensor([3]),
)
assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4)
@parametrize(
[
(
transform,
itertools.chain.from_iterable(
fn(
color_spaces=[
"GRAY",
"RGB",
],
dtypes=[torch.uint8],
extra_dims=[(), (4,)],
**(dict(num_frames=["random"]) if fn is make_videos else dict()),
)
for fn in [
make_images,
make_vanilla_tensor_images,
make_pil_images,
make_videos,
]
),
)
for transform in (
transforms.RandAugment(),
transforms.TrivialAugmentWide(),
transforms.AutoAugment(),
transforms.AugMix(),
)
]
)
def test_auto_augment(self, transform, input):
transform(input)
@parametrize(
[
(
transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]),
itertools.chain.from_iterable(
fn(color_spaces=["RGB"], dtypes=[torch.float32])
for fn in [
make_images,
make_vanilla_tensor_images,
make_videos,
]
),
),
]
)
def test_normalize(self, transform, input):
transform(input)
@parametrize(
[
(
transforms.RandomResizedCrop([16, 16], antialias=True),
itertools.chain(
make_images(extra_dims=[(4,)]),
make_vanilla_tensor_images(),
make_pil_images(),
make_videos(extra_dims=[()]),
),
)
]
)
def test_random_resized_crop(self, transform, input):
transform(input)
@pytest.mark.parametrize(
"flat_inputs",
itertools.permutations(
[
next(make_vanilla_tensor_images()),
next(make_vanilla_tensor_images()),
next(make_pil_images()),
make_image(),
next(make_videos()),
],
3,
),
)
def test_simple_tensor_heuristic(flat_inputs):
def split_on_simple_tensor(to_split):
# This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts:
# 1. The first simple tensor. If none is present, this will be `None`
# 2. A list of the remaining simple tensors
# 3. A list of all other items
simple_tensors = []
others = []
# Splitting always happens on the original `flat_inputs`, so that any erroneous type changes by the transform
# cannot affect the splitting.
for item, inpt in zip(to_split, flat_inputs):
(simple_tensors if is_simple_tensor(inpt) else others).append(item)
return simple_tensors[0] if simple_tensors else None, simple_tensors[1:], others
class CopyCloneTransform(transforms.Transform):
def _transform(self, inpt, params):
return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy()
@staticmethod
def was_applied(output, inpt):
identity = output is inpt
if identity:
return False
# Make sure nothing fishy is going on
assert_equal(output, inpt)
return True
first_simple_tensor_input, other_simple_tensor_inputs, other_inputs = split_on_simple_tensor(flat_inputs)
transform = CopyCloneTransform()
transformed_sample = transform(flat_inputs)
first_simple_tensor_output, other_simple_tensor_outputs, other_outputs = split_on_simple_tensor(transformed_sample)
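# Heuristic checked below: the first plain tensor is only transformed when no other image/video-like input
# is present; all remaining plain tensors are passed through untouched.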
if first_simple_tensor_input is not None:
if other_inputs:
assert not transform.was_applied(first_simple_tensor_output, first_simple_tensor_input)
else:
assert transform.was_applied(first_simple_tensor_output, first_simple_tensor_input)
for output, inpt in zip(other_simple_tensor_outputs, other_simple_tensor_inputs):
assert not transform.was_applied(output, inpt)
for input, output in zip(other_inputs, other_outputs):
assert transform.was_applied(output, input)
@pytest.mark.parametrize("p", [0.0, 1.0])
class TestRandomHorizontalFlip:
def input_expected_image_tensor(self, p, dtype=torch.float32):
input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype)
expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype)
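# with p == 0 the flip is never applied, so the expected output is just the unchanged input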
return input, expected if p == 1 else input
def test_simple_tensor(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(input)
assert_equal(expected, actual)
def test_pil_image(self, p):
input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(to_pil_image(input))
assert_equal(expected, pil_to_tensor(actual))
def test_datapoints_image(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(datapoints.Image(input))
assert_equal(datapoints.Image(expected), actual)
def test_datapoints_mask(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(datapoints.Mask(input))
assert_equal(datapoints.Mask(expected), actual)
def test_datapoints_bounding_box(self, p):
input = datapoints.BoundingBox([0, 0, 5, 5], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(input)
expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input
expected = datapoints.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual)
assert actual.format == expected.format
assert actual.spatial_size == expected.spatial_size
@pytest.mark.parametrize("p", [0.0, 1.0])
class TestRandomVerticalFlip:
def input_expected_image_tensor(self, p, dtype=torch.float32):
input = torch.tensor([[[1, 1], [0, 0]], [[1, 1], [0, 0]]], dtype=dtype)
expected = torch.tensor([[[0, 0], [1, 1]], [[0, 0], [1, 1]]], dtype=dtype)
return input, expected if p == 1 else input
def test_simple_tensor(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(input)
assert_equal(expected, actual)
def test_pil_image(self, p):
input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(to_pil_image(input))
assert_equal(expected, pil_to_tensor(actual))
def test_datapoints_image(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(datapoints.Image(input))
assert_equal(datapoints.Image(expected), actual)
def test_datapoints_mask(self, p):
input, expected = self.input_expected_image_tensor(p)
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(datapoints.Mask(input))
assert_equal(datapoints.Mask(expected), actual)
def test_datapoints_bounding_box(self, p):
input = datapoints.BoundingBox([0, 0, 5, 5], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomVerticalFlip(p=p)
actual = transform(input)
expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input
expected = datapoints.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual)
assert actual.format == expected.format
assert actual.spatial_size == expected.spatial_size
class TestPad:
def test_assertions(self):
with pytest.raises(TypeError, match="Got inappropriate padding arg"):
transforms.Pad("abc")
with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
transforms.Pad([-0.7, 0, 0.7])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.Pad(12, fill="abc")
with pytest.raises(ValueError, match="Padding mode should be either"):
transforms.Pad(12, padding_mode="abc")
@pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("padding_mode", ["constant", "edge"])
def test__transform(self, padding, fill, padding_mode, mocker):
transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
inpt = mocker.MagicMock(spec=datapoints.Image)
_ = transform(inpt)
fill = transforms._utils._convert_fill_arg(fill)
if isinstance(padding, tuple):
padding = list(padding)
fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode)
@pytest.mark.parametrize("fill", [12, {datapoints.Image: 12, datapoints.Mask: 34}])
def test__transform_image_mask(self, fill, mocker):
transform = transforms.Pad(1, fill=fill, padding_mode="constant")
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
image = datapoints.Image(torch.rand(3, 32, 32))
mask = datapoints.Mask(torch.randint(0, 5, size=(32, 32)))
inpt = [image, mask]
_ = transform(inpt)
if isinstance(fill, int):
fill = transforms._utils._convert_fill_arg(fill)
calls = [
mocker.call(image, padding=1, fill=fill, padding_mode="constant"),
mocker.call(mask, padding=1, fill=fill, padding_mode="constant"),
]
else:
fill_img = transforms._utils._convert_fill_arg(fill[type(image)])
fill_mask = transforms._utils._convert_fill_arg(fill[type(mask)])
calls = [
mocker.call(image, padding=1, fill=fill_img, padding_mode="constant"),
mocker.call(mask, padding=1, fill=fill_mask, padding_mode="constant"),
]
fn.assert_has_calls(calls)
class TestRandomZoomOut:
def test_assertions(self):
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomZoomOut(fill="abc")
with pytest.raises(TypeError, match="should be a sequence of length"):
transforms.RandomZoomOut(0, side_range=0)
with pytest.raises(ValueError, match="Invalid canvas side range"):
transforms.RandomZoomOut(0, side_range=[4.0, 1.0])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]])
def test__get_params(self, fill, side_range, mocker):
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range)
image = mocker.MagicMock(spec=datapoints.Image)
h, w = image.spatial_size = (24, 32)
params = transform._get_params([image])
assert len(params["padding"]) == 4
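# each padding entry is bounded by the maximum canvas growth, i.e. (side_range[1] - 1) times the corresponding image dimension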
assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w
assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h
assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w
assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]])
def test__transform(self, fill, side_range, mocker):
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, fill=fill)
@pytest.mark.parametrize("fill", [12, {datapoints.Image: 12, datapoints.Mask: 34}])
def test__transform_image_mask(self, fill, mocker):
transform = transforms.RandomZoomOut(fill=fill, p=1.0)
fn = mocker.patch("torchvision.transforms.v2.functional.pad")
image = datapoints.Image(torch.rand(3, 32, 32))
mask = datapoints.Mask(torch.randint(0, 5, size=(32, 32)))
inpt = [image, mask]
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params(inpt)
if isinstance(fill, int):
fill = transforms._utils._convert_fill_arg(fill)
calls = [
mocker.call(image, **params, fill=fill),
mocker.call(mask, **params, fill=fill),
]
else:
fill_img = transforms._utils._convert_fill_arg(fill[type(image)])
fill_mask = transforms._utils._convert_fill_arg(fill[type(mask)])
calls = [
mocker.call(image, **params, fill=fill_img),
mocker.call(mask, **params, fill=fill_mask),
]
fn.assert_has_calls(calls)
class TestRandomRotation:
def test_assertions(self):
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomRotation(-0.7)
for d in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="degrees should be a sequence of length 2"):
transforms.RandomRotation(d)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomRotation(12, fill="abc")
with pytest.raises(TypeError, match="center should be a sequence of length"):
transforms.RandomRotation(12, center=12)
with pytest.raises(ValueError, match="center should be a sequence of length"):
transforms.RandomRotation(12, center=[1, 2, 3])
def test__get_params(self):
angle_bound = 34
transform = transforms.RandomRotation(angle_bound)
params = transform._get_params(None)
assert -angle_bound <= params["angle"] <= angle_bound
angle_bounds = [12, 34]
transform = transforms.RandomRotation(angle_bounds)
params = transform._get_params(None)
assert angle_bounds[0] <= params["angle"] <= angle_bounds[1]
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("center", [None, [2.0, 3.0]])
def test__transform(self, degrees, expand, fill, center, mocker):
interpolation = InterpolationMode.BILINEAR
transform = transforms.RandomRotation(
degrees, interpolation=interpolation, expand=expand, fill=fill, center=center
)
if isinstance(degrees, (tuple, list)):
assert transform.degrees == [float(degrees[0]), float(degrees[1])]
else:
assert transform.degrees == [float(-degrees), float(degrees)]
fn = mocker.patch("torchvision.transforms.v2.functional.rotate")
inpt = mocker.MagicMock(spec=datapoints.Image)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params(inpt)
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center)
@pytest.mark.parametrize("angle", [34, -87])
@pytest.mark.parametrize("expand", [False, True])
def test_boundingbox_spatial_size(self, angle, expand):
# Specific test for BoundingBox.rotate
bbox = datapoints.BoundingBox(
torch.tensor([1, 2, 3, 4]), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(32, 32)
)
img = datapoints.Image(torch.rand(1, 3, 32, 32))
out_img = img.rotate(angle, expand=expand)
out_bbox = bbox.rotate(angle, expand=expand)
assert out_img.spatial_size == out_bbox.spatial_size
class TestRandomAffine:
def test_assertions(self):
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomAffine(-0.7)
for d in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="degrees should be a sequence of length 2"):
transforms.RandomAffine(d)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomAffine(12, fill="abc")
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomAffine(12, fill="abc")
for kwargs in [
{"center": 12},
{"translate": 12},
{"scale": 12},
]:
with pytest.raises(TypeError, match="should be a sequence of length"):
transforms.RandomAffine(12, **kwargs)
for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]:
with pytest.raises(ValueError, match="should be a sequence of length"):
transforms.RandomAffine(12, **kwargs)
with pytest.raises(ValueError, match="translation values should be between 0 and 1"):
transforms.RandomAffine(12, translate=[-1.0, 2.0])
with pytest.raises(ValueError, match="scale values should be positive"):
transforms.RandomAffine(12, scale=[-1.0, 2.0])
with pytest.raises(ValueError, match="is a single number, it must be positive"):
transforms.RandomAffine(12, shear=-10)
for s in [[-0.7], [-0.7, 0, 0.7]]:
with pytest.raises(ValueError, match="shear should be a sequence of length 2"):
transforms.RandomAffine(12, shear=s)
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("translate", [None, [0.1, 0.2]])
@pytest.mark.parametrize("scale", [None, [0.7, 1.2]])
@pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]])
def test__get_params(self, degrees, translate, scale, shear, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
h, w = image.spatial_size
transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear)
params = transform._get_params([image])
if not isinstance(degrees, (list, tuple)):
assert -degrees <= params["angle"] <= degrees
else:
assert degrees[0] <= params["angle"] <= degrees[1]
if translate is not None:
w_max = int(round(translate[0] * w))
h_max = int(round(translate[1] * h))
assert -w_max <= params["translate"][0] <= w_max
assert -h_max <= params["translate"][1] <= h_max
else:
assert params["translate"] == (0, 0)
if scale is not None:
assert scale[0] <= params["scale"] <= scale[1]
else:
assert params["scale"] == 1.0
if shear is not None:
if isinstance(shear, float):
assert -shear <= params["shear"][0] <= shear
assert params["shear"][1] == 0.0
elif len(shear) == 2:
assert shear[0] <= params["shear"][0] <= shear[1]
assert params["shear"][1] == 0.0
else:
assert shear[0] <= params["shear"][0] <= shear[1]
assert shear[2] <= params["shear"][1] <= shear[3]
else:
assert params["shear"] == (0, 0)
@pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)])
@pytest.mark.parametrize("translate", [None, [0.1, 0.2]])
@pytest.mark.parametrize("scale", [None, [0.7, 1.2]])
@pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]])
@pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)])
@pytest.mark.parametrize("center", [None, [2.0, 3.0]])
def test__transform(self, degrees, translate, scale, shear, fill, center, mocker):
interpolation = InterpolationMode.BILINEAR
transform = transforms.RandomAffine(
degrees,
translate=translate,
scale=scale,
shear=shear,
interpolation=interpolation,
fill=fill,
center=center,
)
if isinstance(degrees, (tuple, list)):
assert transform.degrees == [float(degrees[0]), float(degrees[1])]
else:
assert transform.degrees == [float(-degrees), float(degrees)]
fn = mocker.patch("torchvision.transforms.v2.functional.affine")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center)
class TestRandomCrop:
def test_assertions(self):
with pytest.raises(ValueError, match="Please provide only two dimensions"):
transforms.RandomCrop([10, 12, 14])
with pytest.raises(TypeError, match="Got inappropriate padding arg"):
transforms.RandomCrop([10, 12], padding="abc")
with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomCrop([10, 12], padding=1, fill="abc")
with pytest.raises(ValueError, match="Padding mode should be either"):
transforms.RandomCrop([10, 12], padding=1, padding_mode="abc")
@pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]])
@pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)])
def test__get_params(self, padding, pad_if_needed, size, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
h, w = image.spatial_size
transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed)
params = transform._get_params([image])
if padding is not None:
if isinstance(padding, int):
pad_top = pad_bottom = pad_left = pad_right = padding
elif isinstance(padding, list) and len(padding) == 2:
pad_left = pad_right = padding[0]
pad_top = pad_bottom = padding[1]
elif isinstance(padding, list) and len(padding) == 4:
pad_left, pad_top, pad_right, pad_bottom = padding
h += pad_top + pad_bottom
w += pad_left + pad_right
else:
pad_left = pad_right = pad_top = pad_bottom = 0
if pad_if_needed:
if w < size[1]:
diff = size[1] - w
pad_left += diff
pad_right += diff
w += 2 * diff
if h < size[0]:
diff = size[0] - h
pad_top += diff
pad_bottom += diff
h += 2 * diff
padding = [pad_left, pad_top, pad_right, pad_bottom]
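# With the expected padding folded in, the sampled crop origin must fit inside the (possibly padded)
# image, and the remaining params must echo the requested size and padding.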
assert 0 <= params["top"] <= h - size[0] + 1
assert 0 <= params["left"] <= w - size[1] + 1
assert params["height"] == size[0]
assert params["width"] == size[1]
assert params["needs_pad"] is any(padding)
assert params["padding"] == padding
@pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]])
@pytest.mark.parametrize("pad_if_needed", [False, True])
@pytest.mark.parametrize("fill", [False, True])
@pytest.mark.parametrize("padding_mode", ["constant", "edge"])
def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker):
output_size = [10, 12]
transform = transforms.RandomCrop(
output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode
)
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (32, 32)
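# `expected` only stands in for the return value of the mocked pad call below; its metadata just needs to be plausible.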
expected = mocker.MagicMock(spec=datapoints.Image)
expected.num_channels = 3
if isinstance(padding, int):
expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding)
elif isinstance(padding, list):
expected.spatial_size = (
inpt.spatial_size[0] + sum(padding[0::2]),
inpt.spatial_size[1] + sum(padding[1::2]),
)
else:
expected.spatial_size = inpt.spatial_size
_ = mocker.patch("torchvision.transforms.v2.functional.pad", return_value=expected)
fn_crop = mocker.patch("torchvision.transforms.v2.functional.crop")
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
if padding is None and not pad_if_needed:
fn_crop.assert_called_once_with(
inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1]
)
elif not pad_if_needed:
fn_crop.assert_called_once_with(
expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1]
)
elif padding is None:
# vfdev-5: I do not know how to mock and test this case
pass
else:
# vfdev-5: I do not know how to mock and test this case
pass
class TestGaussianBlur:
def test_assertions(self):
with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"):
transforms.GaussianBlur([10, 12, 14])
with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"):
transforms.GaussianBlur(4)
with pytest.raises(
TypeError, match="sigma should be a single int or float or a list/tuple with length 2 floats."
):
transforms.GaussianBlur(3, sigma=[1, 2, 3])
with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"):
transforms.GaussianBlur(3, sigma=-1.0)
with pytest.raises(ValueError, match="sigma values should be positive and of the form"):
transforms.GaussianBlur(3, sigma=[2.0, 1.0])
@pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]])
def test__get_params(self, sigma):
transform = transforms.GaussianBlur(3, sigma=sigma)
params = transform._get_params([])
if isinstance(sigma, float):
assert params["sigma"][0] == params["sigma"][1] == 10
else:
assert sigma[0] <= params["sigma"][0] <= sigma[1]
assert sigma[0] <= params["sigma"][1] <= sigma[1]
@pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)])
@pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]])
def test__transform(self, kernel_size, sigma, mocker):
transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma)
if isinstance(kernel_size, (tuple, list)):
assert transform.kernel_size == kernel_size
else:
kernel_size = (kernel_size, kernel_size)
assert transform.kernel_size == kernel_size
if isinstance(sigma, (tuple, list)):
assert transform.sigma == sigma
else:
assert transform.sigma == [sigma, sigma]
fn = mocker.patch("torchvision.transforms.v2.functional.gaussian_blur")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
params = transform._get_params([inpt])
fn.assert_called_once_with(inpt, kernel_size, **params)
class TestRandomColorOp:
@pytest.mark.parametrize("p", [0.0, 1.0])
@pytest.mark.parametrize(
"transform_cls, func_op_name, kwargs",
[
(transforms.RandomEqualize, "equalize", {}),
(transforms.RandomInvert, "invert", {}),
(transforms.RandomAutocontrast, "autocontrast", {}),
(transforms.RandomPosterize, "posterize", {"bits": 4}),
(transforms.RandomSolarize, "solarize", {"threshold": 0.5}),
(transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}),
],
)
def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker):
transform = transform_cls(p=p, **kwargs)
fn = mocker.patch(f"torchvision.transforms.v2.functional.{func_op_name}")
inpt = mocker.MagicMock(spec=datapoints.Image)
_ = transform(inpt)
if p > 0.0:
fn.assert_called_once_with(inpt, **kwargs)
else:
assert fn.call_count == 0
class TestRandomPerspective:
def test_assertions(self):
with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"):
transforms.RandomPerspective(distortion_scale=-1.0)
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.RandomPerspective(0.5, fill="abc")
def test__get_params(self, mocker):
dscale = 0.5
transform = transforms.RandomPerspective(dscale)
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
params = transform._get_params([image])
h, w = image.spatial_size
assert "coefficients" in params
assert len(params["coefficients"]) == 8
@pytest.mark.parametrize("distortion_scale", [0.1, 0.7])
def test__transform(self, distortion_scale, mocker):
interpolation = InterpolationMode.BILINEAR
fill = 12
transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation)
fn = mocker.patch("torchvision.transforms.v2.functional.perspective")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute
# This could also be helpful for users
# Otherwise, we can mock transform._get_params
torch.manual_seed(12)
_ = transform(inpt)
torch.manual_seed(12)
torch.rand(1) # random apply changes random state
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, None, None, **params, fill=fill, interpolation=interpolation)
class TestElasticTransform:
def test_assertions(self):
with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"):
transforms.ElasticTransform({})
with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"):
transforms.ElasticTransform([1.0, 2.0, 3.0])
with pytest.raises(ValueError, match="alpha should be a sequence of floats"):
transforms.ElasticTransform([1, 2])
with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"):
transforms.ElasticTransform(1.0, {})
with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"):
transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0])
with pytest.raises(ValueError, match="sigma should be a sequence of floats"):
transforms.ElasticTransform(1.0, [1, 2])
with pytest.raises(TypeError, match="Got inappropriate fill arg"):
transforms.ElasticTransform(1.0, 2.0, fill="abc")
def test__get_params(self, mocker):
alpha = 2.0
sigma = 3.0
transform = transforms.ElasticTransform(alpha, sigma)
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
params = transform._get_params([image])
h, w = image.spatial_size
displacement = params["displacement"]
assert displacement.shape == (1, h, w, 2)
assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all()
assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all()
@pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]])
@pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]])
def test__transform(self, alpha, sigma, mocker):
interpolation = InterpolationMode.BILINEAR
fill = 12
transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation)
if isinstance(alpha, float):
assert transform.alpha == [alpha, alpha]
else:
assert transform.alpha == alpha
if isinstance(sigma, float):
assert transform.sigma == [sigma, sigma]
else:
assert transform.sigma == sigma
fn = mocker.patch("torchvision.transforms.v2.functional.elastic")
inpt = mocker.MagicMock(spec=datapoints.Image)
inpt.num_channels = 3
inpt.spatial_size = (24, 32)
# Let's mock transform._get_params to control the output:
transform._get_params = mocker.MagicMock()
_ = transform(inpt)
params = transform._get_params([inpt])
fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation)
class TestRandomErasing:
def test_assertions(self, mocker):
with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"):
transforms.RandomErasing(value={})
with pytest.raises(ValueError, match="If value is str, it should be 'random'"):
transforms.RandomErasing(value="abc")
with pytest.raises(TypeError, match="Scale should be a sequence"):
transforms.RandomErasing(scale=123)
with pytest.raises(TypeError, match="Ratio should be a sequence"):
transforms.RandomErasing(ratio=123)
with pytest.raises(ValueError, match="Scale should be between 0 and 1"):
transforms.RandomErasing(scale=[-1, 2])
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=[1, 2, 3, 4])
with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"):
transform._get_params([image])
@pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"])
def test__get_params(self, value, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=value)
params = transform._get_params([image])
v = params["v"]
h, w = params["h"], params["w"]
i, j = params["i"], params["j"]
assert isinstance(v, torch.Tensor)
if value == "random":
assert v.shape == (image.num_channels, h, w)
elif isinstance(value, (int, float)):
assert v.shape == (1, 1, 1)
elif isinstance(value, (list, tuple)):
assert v.shape == (image.num_channels, 1, 1)
assert 0 <= i <= image.spatial_size[0] - h
assert 0 <= j <= image.spatial_size[1] - w
@pytest.mark.parametrize("p", [0, 1])
def test__transform(self, mocker, p):
transform = transforms.RandomErasing(p=p)
transform._transformed_types = (mocker.MagicMock,)
i_sentinel = mocker.MagicMock()
j_sentinel = mocker.MagicMock()
h_sentinel = mocker.MagicMock()
w_sentinel = mocker.MagicMock()
v_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._augment.RandomErasing._get_params",
return_value=dict(i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._augment.F.erase")
output = transform(inpt_sentinel)
if p:
mock.assert_called_once_with(
inpt_sentinel,
i=i_sentinel,
j=j_sentinel,
h=h_sentinel,
w=w_sentinel,
v=v_sentinel,
inplace=transform.inplace,
)
else:
mock.assert_not_called()
assert output is inpt_sentinel
class TestTransform:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test_check_transformed_types(self, inpt_type, mocker):
# This test ensures that we correctly handle which types to transform and which to bypass
t = transforms.Transform()
inpt = mocker.MagicMock(spec=inpt_type)
if inpt_type in (np.ndarray, str, int):
output = t(inpt)
assert output is inpt
else:
with pytest.raises(NotImplementedError):
t(inpt)
class TestToImageTensor:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch(
"torchvision.transforms.v2.functional.to_image_tensor",
return_value=torch.rand(1, 3, 8, 8),
)
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToImageTensor()
transform(inpt)
if inpt_type in (datapoints.BoundingBox, datapoints.Image, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt)
class TestToImagePIL:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil")
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToImagePIL()
transform(inpt)
if inpt_type in (datapoints.BoundingBox, PIL.Image.Image, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt, mode=transform.mode)
class TestToPILImage:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.v2.functional.to_image_pil")
inpt = mocker.MagicMock(spec=inpt_type)
transform = transforms.ToPILImage()
transform(inpt)
if inpt_type in (PIL.Image.Image, datapoints.BoundingBox, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt, mode=transform.mode)
class TestToTensor:
@pytest.mark.parametrize(
"inpt_type",
[torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBox, str, int],
)
def test__transform(self, inpt_type, mocker):
fn = mocker.patch("torchvision.transforms.functional.to_tensor")
inpt = mocker.MagicMock(spec=inpt_type)
with pytest.warns(UserWarning, match="deprecated and will be removed"):
transform = transforms.ToTensor()
transform(inpt)
if inpt_type in (datapoints.Image, torch.Tensor, datapoints.BoundingBox, str, int):
assert fn.call_count == 0
else:
fn.assert_called_once_with(inpt)
class TestContainers:
@pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder])
def test_assertions(self, transform_cls):
with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"):
transform_cls(transforms.RandomCrop(28))
@pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder])
@pytest.mark.parametrize(
"trfms",
[
[transforms.Pad(2), transforms.RandomCrop(28)],
[lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)],
[transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)],
],
)
def test_ctor(self, transform_cls, trfms):
c = transform_cls(trfms)
inpt = torch.rand(1, 3, 32, 32)
output = c(inpt)
assert isinstance(output, torch.Tensor)
assert output.ndim == 4
class TestRandomChoice:
def test_assertions(self):
with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"):
transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2])
with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"):
transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1])
class TestRandomIoUCrop:
@pytest.mark.parametrize("device", cpu_and_gpu())
@pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]])
def test__get_params(self, device, options, mocker):
image = mocker.MagicMock(spec=datapoints.Image)
image.num_channels = 3
image.spatial_size = (24, 32)
bboxes = datapoints.BoundingBox(
torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]),
format="XYXY",
spatial_size=image.spatial_size,
device=device,
)
sample = [image, bboxes]
transform = transforms.RandomIoUCrop(sampler_options=options)
n_samples = 5
for _ in range(n_samples):
params = transform._get_params(sample)
if options == [2.0]:
assert len(params) == 0
return
assert len(params["is_within_crop_area"]) > 0
assert params["is_within_crop_area"].dtype == torch.bool
orig_h = image.spatial_size[0]
orig_w = image.spatial_size[1]
assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h)
assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w)
left, top = params["left"], params["top"]
new_h, new_w = params["height"], params["width"]
ious = box_iou(
bboxes,
torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device),
)
assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}"
def test__transform_empty_params(self, mocker):
transform = transforms.RandomIoUCrop(sampler_options=[2.0])
image = datapoints.Image(torch.rand(1, 3, 4, 4))
bboxes = datapoints.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4))
label = torch.tensor([1])
sample = [image, bboxes, label]
# Let's mock transform._get_params to control the output:
transform._get_params = mocker.MagicMock(return_value={})
output = transform(sample)
torch.testing.assert_close(output, sample)
def test_forward_assertion(self):
transform = transforms.RandomIoUCrop()
with pytest.raises(
TypeError,
match="requires input sample to contain tensor or PIL images and bounding boxes",
):
transform(torch.tensor(0))
def test__transform(self, mocker):
transform = transforms.RandomIoUCrop()
image = datapoints.Image(torch.rand(3, 32, 24))
bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,))
masks = make_detection_mask((32, 24), num_objects=6)
sample = [image, bboxes, masks]
fn = mocker.patch("torchvision.transforms.v2.functional.crop", side_effect=lambda x, **params: x)
is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool)
params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area)
transform._get_params = mocker.MagicMock(return_value=params)
output = transform(sample)
assert fn.call_count == 3
expected_calls = [
mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
]
fn.assert_has_calls(expected_calls)
# boxes that fall outside the crop area are zeroed out rather than dropped:
output_bboxes = output[1]
assert isinstance(output_bboxes, datapoints.BoundingBox)
assert (output_bboxes[~is_within_crop_area] == 0).all()
output_masks = output[2]
assert isinstance(output_masks, datapoints.Mask)
class TestScaleJitter:
def test__get_params(self, mocker):
spatial_size = (24, 32)
target_size = (16, 12)
scale_range = (0.5, 1.5)
transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range)
sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
n_samples = 5
for _ in range(n_samples):
params = transform._get_params([sample])
assert "size" in params
size = params["size"]
assert isinstance(size, tuple) and len(size) == 2
height, width = size
r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0]
r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1]
assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max)
assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max)
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.ScaleJitter(
target_size=(16, 12), interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.ScaleJitter._get_params", return_value=dict(size=size_sentinel)
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock.assert_called_once_with(
inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
class TestRandomShortestSize:
@pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)])
def test__get_params(self, min_size, max_size, mocker):
spatial_size = (3, 10)
transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)
sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
params = transform._get_params([sample])
assert "size" in params
size = params["size"]
assert isinstance(size, tuple) and len(size) == 2
longer = max(size)
shorter = min(size)
if max_size is not None:
assert longer <= max_size
assert shorter <= max_size
else:
assert shorter in min_size
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.RandomShortestSize(
min_size=[3, 5, 7], max_size=12, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.RandomShortestSize._get_params",
return_value=dict(size=size_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock.assert_called_once_with(
inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
class TestLinearTransformation:
def test_assertions(self):
with pytest.raises(ValueError, match="transformation_matrix should be square"):
transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5))
with pytest.raises(ValueError, match="mean_vector should have the same length"):
transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5))
@pytest.mark.parametrize(
"inpt",
[
122 * torch.ones(1, 3, 8, 8),
122.0 * torch.ones(1, 3, 8, 8),
datapoints.Image(122 * torch.ones(1, 3, 8, 8)),
PIL.Image.new("RGB", (8, 8), (122, 122, 122)),
],
)
def test__transform(self, inpt):
v = 121 * torch.ones(3 * 8 * 8)
m = torch.ones(3 * 8 * 8, 3 * 8 * 8)
transform = transforms.LinearTransformation(m, v)
if isinstance(inpt, PIL.Image.Image):
with pytest.raises(TypeError, match="LinearTransformation does not work on PIL Images"):
transform(inpt)
else:
output = transform(inpt)
assert isinstance(output, torch.Tensor)
assert output.unique() == 3 * 8 * 8
assert output.dtype == inpt.dtype
class TestRandomResize:
def test__get_params(self):
min_size = 3
max_size = 6
transform = transforms.RandomResize(min_size=min_size, max_size=max_size)
for _ in range(10):
params = transform._get_params([])
assert isinstance(params["size"], list) and len(params["size"]) == 1
size = params["size"][0]
assert min_size <= size < max_size
def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode)
antialias_sentinel = mocker.MagicMock()
transform = transforms.RandomResize(
min_size=-1, max_size=-1, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
transform._transformed_types = (mocker.MagicMock,)
size_sentinel = mocker.MagicMock()
mocker.patch(
"torchvision.transforms.v2._geometry.RandomResize._get_params",
return_value=dict(size=size_sentinel),
)
inpt_sentinel = mocker.MagicMock()
mock_resize = mocker.patch("torchvision.transforms.v2._geometry.F.resize")
transform(inpt_sentinel)
mock_resize.assert_called_with(
inpt_sentinel, size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel
)
class TestToDtype:
@pytest.mark.parametrize(
("dtype", "expected_dtypes"),
[
(
torch.float64,
{
datapoints.Video: torch.float64,
datapoints.Image: torch.float64,
datapoints.BoundingBox: torch.float64,
},
),
(
{datapoints.Video: torch.int32, datapoints.Image: torch.float32, datapoints.BoundingBox: torch.float64},
{datapoints.Video: torch.int32, datapoints.Image: torch.float32, datapoints.BoundingBox: torch.float64},
),
],
)
def test_call(self, dtype, expected_dtypes):
sample = dict(
video=make_video(dtype=torch.int64),
image=make_image(dtype=torch.uint8),
bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32),
str="str",
int=0,
)
transform = transforms.ToDtype(dtype)
transformed_sample = transform(sample)
for key, value in sample.items():
value_type = type(value)
transformed_value = transformed_sample[key]
# make sure the transformation retains the type
assert isinstance(transformed_value, value_type)
if isinstance(value, torch.Tensor):
assert transformed_value.dtype is expected_dtypes[value_type]
else:
assert transformed_value is value
@pytest.mark.filterwarnings("error")
def test_plain_tensor_call(self):
tensor = torch.empty((), dtype=torch.float32)
transform = transforms.ToDtype({torch.Tensor: torch.float64})
assert transform(tensor).dtype is torch.float64
@pytest.mark.parametrize("other_type", [datapoints.Image, datapoints.Video])
def test_plain_tensor_warning(self, other_type):
with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")):
transforms.ToDtype(dtype={torch.Tensor: torch.float32, other_type: torch.float64})
class TestUniformTemporalSubsample:
@pytest.mark.parametrize(
"inpt",
[
torch.zeros(10, 3, 8, 8),
torch.zeros(1, 10, 3, 8, 8),
datapoints.Video(torch.zeros(1, 10, 3, 8, 8)),
],
)
def test__transform(self, inpt):
num_samples = 5
transform = transforms.UniformTemporalSubsample(num_samples)
output = transform(inpt)
assert type(output) is type(inpt)
assert output.shape[-4] == num_samples
assert output.dtype == inpt.dtype
# TODO: remove this test in 0.17 when the default of antialias changes to True
def test_antialias_warning():
pil_img = PIL.Image.new("RGB", size=(10, 10), color=127)
tensor_img = torch.randint(0, 256, size=(3, 10, 10), dtype=torch.uint8)
tensor_video = torch.randint(0, 256, size=(2, 3, 10, 10), dtype=torch.uint8)
match = "The default value of the antialias parameter"
with pytest.warns(UserWarning, match=match):
transforms.Resize((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomResizedCrop((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.ScaleJitter((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomShortestSize((20, 20))(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.RandomResize(10, 20)(tensor_img)
with pytest.warns(UserWarning, match=match):
transforms.functional.resize(tensor_img, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize_image_tensor(tensor_img, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize(tensor_video, (20, 20))
with pytest.warns(UserWarning, match=match):
transforms.functional.resize_video(tensor_video, (20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Image(tensor_img).resize((20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Image(tensor_img).resized_crop(0, 0, 10, 10, (20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Video(tensor_video).resize((20, 20))
with pytest.warns(UserWarning, match=match):
datapoints.Video(tensor_video).resized_crop(0, 0, 10, 10, (20, 20))
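# PIL inputs and calls that pass antialias=True explicitly must not warn: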
with warnings.catch_warnings():
warnings.simplefilter("error")
transforms.Resize((20, 20))(pil_img)
transforms.RandomResizedCrop((20, 20))(pil_img)
transforms.ScaleJitter((20, 20))(pil_img)
transforms.RandomShortestSize((20, 20))(pil_img)
transforms.RandomResize(10, 20)(pil_img)
transforms.functional.resize(pil_img, (20, 20))
transforms.Resize((20, 20), antialias=True)(tensor_img)
transforms.RandomResizedCrop((20, 20), antialias=True)(tensor_img)
transforms.ScaleJitter((20, 20), antialias=True)(tensor_img)
transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img)
transforms.RandomResize(10, 20, antialias=True)(tensor_img)
transforms.functional.resize(tensor_img, (20, 20), antialias=True)
transforms.functional.resize_image_tensor(tensor_img, (20, 20), antialias=True)
transforms.functional.resize(tensor_video, (20, 20), antialias=True)
transforms.functional.resize_video(tensor_video, (20, 20), antialias=True)
datapoints.Image(tensor_img).resize((20, 20), antialias=True)
datapoints.Image(tensor_img).resized_crop(0, 0, 10, 10, (20, 20), antialias=True)
datapoints.Video(tensor_video).resize((20, 20), antialias=True)
datapoints.Video(tensor_video).resized_crop(0, 0, 10, 10, (20, 20), antialias=True)
@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image))
@pytest.mark.parametrize("label_type", (torch.Tensor, int))
@pytest.mark.parametrize("dataset_return_type", (dict, tuple))
@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImageTensor))
def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor):
image = datapoints.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8))
if image_type is PIL.Image:
image = to_pil_image(image[0])
elif image_type is torch.Tensor:
image = image.as_subclass(torch.Tensor)
assert is_simple_tensor(image)
label = 1 if label_type is int else torch.tensor([1])
if dataset_return_type is dict:
sample = {
"image": image,
"label": label,
}
else:
sample = image, label
t = transforms.Compose(
[
transforms.RandomResizedCrop((224, 224)),
transforms.RandomHorizontalFlip(p=1),
transforms.RandAugment(),
transforms.TrivialAugmentWide(),
transforms.AugMix(),
transforms.AutoAugment(),
to_tensor(),
# TODO: ConvertImageDtype is a pass-through on PIL images, is that
# intended? This results in a failure if we convert to tensor after
# it, because the image would still be uint8 which makes Normalize
# fail.
transforms.ConvertImageDtype(torch.float),
transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]),
transforms.RandomErasing(p=1),
]
)
out = t(sample)
assert type(out) == type(sample)
if dataset_return_type is tuple:
out_image, out_label = out
else:
assert out.keys() == sample.keys()
out_image, out_label = out.values()
assert out_image.shape[-2:] == (224, 224)
assert out_label == label
@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image))
@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite"))
@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImageTensor))
@pytest.mark.parametrize("sanitize", (True, False))
def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
torch.manual_seed(0)
if data_augmentation == "hflip":
t = [
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "lsj":
t = [
transforms.ScaleJitter(target_size=(1024, 1024), antialias=True),
# Note: replaced FixedSizeCrop with RandomCrop, because we're
# leaving FixedSizeCrop in prototype for now, and it expects Label
# classes which we won't release yet.
# transforms.FixedSizeCrop(
# size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})
# ),
transforms.RandomCrop((1024, 1024), pad_if_needed=True),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "multiscale":
t = [
transforms.RandomShortestSize(
min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True
),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "ssd":
t = [
transforms.RandomPhotometricDistort(p=1),
transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})),
transforms.RandomIoUCrop(),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
elif data_augmentation == "ssdlite":
t = [
transforms.RandomIoUCrop(),
transforms.RandomHorizontalFlip(p=1),
to_tensor(),
transforms.ConvertImageDtype(torch.float),
]
if sanitize:
t += [transforms.SanitizeBoundingBoxes()]
t = transforms.Compose(t)
num_boxes = 5
H = W = 250
image = datapoints.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8))
if image_type is PIL.Image:
image = to_pil_image(image[0])
elif image_type is torch.Tensor:
image = image.as_subclass(torch.Tensor)
assert is_simple_tensor(image)
label = torch.randint(0, 10, size=(num_boxes,))
boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4))
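# Interpret the last two values as width/height offsets so that x2 >= x1 and y2 >= y1, i.e. valid XYXY boxes.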
boxes[:, 2:] += boxes[:, :2]
boxes = boxes.clamp(min=0, max=min(H, W))
boxes = datapoints.BoundingBox(boxes, format="XYXY", spatial_size=(H, W))
masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8))
sample = {
"image": image,
"label": label,
"boxes": boxes,
"masks": masks,
}
out = t(sample)
if to_tensor is transforms.ToTensor and image_type is not datapoints.Image:
assert is_simple_tensor(out["image"])
else:
assert isinstance(out["image"], datapoints.Image)
assert isinstance(out["label"], type(sample["label"]))
num_boxes_expected = {
# ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It
# doesn't remove them strictly speaking, it just marks some boxes as
# degenerate and those boxes will be later removed by
# SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize
# param is True.
# Note that the values below are probably specific to the random seed
# set above (which is fine).
(True, "ssd"): 4,
(True, "ssdlite"): 4,
}.get((sanitize, data_augmentation), num_boxes)
assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected
@pytest.mark.parametrize("min_size", (1, 10))
@pytest.mark.parametrize(
"labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None)
)
def test_sanitize_bounding_boxes(min_size, labels_getter):
H, W = 256, 128
boxes_and_validity = [
([0, 1, 10, 1], False), # Y1 == Y2
([0, 1, 0, 20], False), # X1 == X2
([0, 0, min_size - 1, 10], False), # W < min_size
([0, 0, 10, min_size - 1], False), # H < min_size
([0, 0, 10, H + 1], False), # Y2 > H
([0, 0, W + 1, 10], False), # X2 > W
([-1, 1, 10, 20], False), # any < 0
([0, 0, -1, 20], False), # any < 0
([0, 0, -10, -1], False), # any < 0
([0, 0, min_size, 10], True), # W == min_size
([0, 0, 10, min_size], True), # H == min_size
([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1?
([1, 1, 30, 20], True),
([0, 0, 10, 10], True),
([1, 1, 30, 20], True),
]
random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases
boxes, is_valid_mask = zip(*boxes_and_validity)
valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid]
boxes = torch.tensor(boxes)
labels = torch.arange(boxes.shape[0])
boxes = datapoints.BoundingBox(
boxes,
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(H, W),
)
masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W)))
sample = {
"image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8),
"labels": labels,
"boxes": boxes,
"whatever": torch.rand(10),
"None": None,
"masks": masks,
}
out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample)
assert out["image"] is sample["image"]
assert out["whatever"] is sample["whatever"]
if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None):
assert out["labels"] is sample["labels"]
else:
assert isinstance(out["labels"], torch.Tensor)
assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0]
# This works because we conveniently set labels to arange(num_boxes)
assert out["labels"].tolist() == valid_indices
@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT"))
def test_sanitize_bounding_boxes_default_heuristic(key):
labels = torch.arange(10)
d = {key: labels}
assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels
if key.lower() != "labels":
# If "labels" is in the dict (case-insensitive),
# it takes precedence over other keys which would otherwise be a match
d = {key: "something_else", "labels": labels}
assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels
def test_sanitize_bounding_boxes_errors():
good_bbox = datapoints.BoundingBox(
[[0, 0, 10, 10]],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(20, 20),
)
with pytest.raises(ValueError, match="min_size must be >= 1"):
transforms.SanitizeBoundingBoxes(min_size=0)
with pytest.raises(ValueError, match="labels_getter should either be a str"):
transforms.SanitizeBoundingBoxes(labels_getter=12)
with pytest.raises(ValueError, match="Could not infer where the labels are"):
bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])}
transforms.SanitizeBoundingBoxes()(bad_labels_key)
with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"):
not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0]))
transforms.SanitizeBoundingBoxes()(not_a_dict)
with pytest.raises(ValueError, match="must be a tensor"):
not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()}
transforms.SanitizeBoundingBoxes()(not_a_tensor)
with pytest.raises(ValueError, match="Number of boxes"):
different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)}
transforms.SanitizeBoundingBoxes()(different_sizes)
with pytest.raises(ValueError, match="boxes must be of shape"):
bad_bbox = datapoints.BoundingBox( # batch with 2 elements
[
[[0, 0, 10, 10]],
[[0, 0, 10, 10]],
],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=(20, 20),
)
different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])}
transforms.SanitizeBoundingBoxes()(different_sizes)
......@@ -12,9 +12,8 @@ import PIL.Image
import pytest
import torch
import torchvision.prototype.transforms as prototype_transforms
import torchvision.transforms.v2 as v2_transforms
from prototype_common_utils import (
from common_utils import (
ArgsKwargs,
assert_close,
assert_equal,
......@@ -22,7 +21,6 @@ from prototype_common_utils import (
make_detection_mask,
make_image,
make_images,
make_label,
make_segmentation_mask,
)
from torch import nn
......@@ -1056,6 +1054,9 @@ class TestRefDetTransforms:
size = (600, 800)
num_objects = 22
def make_label(extra_dims, categories):
return torch.randint(categories, extra_dims, dtype=torch.int64)
pil_image = to_image_pil(make_image(size=size, color_space="RGB"))
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
......@@ -1102,11 +1103,6 @@ class TestRefDetTransforms:
),
(det_transforms.RandomZoomOut(), v2_transforms.RandomZoomOut(), {"with_mask": False}),
(det_transforms.ScaleJitter((1024, 1024)), v2_transforms.ScaleJitter((1024, 1024)), {}),
(
det_transforms.FixedSizeCrop((1024, 1024), fill=0),
prototype_transforms.FixedSizeCrop((1024, 1024), fill=0),
{},
),
(
det_transforms.RandomShortestSize(
min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333
......
......@@ -11,15 +11,16 @@ import pytest
import torch
from common_utils import cache, cpu_and_gpu, needs_cuda, set_rng_seed
from prototype_common_utils import (
from common_utils import (
assert_close,
cache,
cpu_and_gpu,
DEFAULT_SQUARE_SPATIAL_SIZE,
make_bounding_boxes,
needs_cuda,
parametrized_error_message,
set_rng_seed,
)
from prototype_transforms_dispatcher_infos import DISPATCHER_INFOS
from prototype_transforms_kernel_infos import KERNEL_INFOS
from torch.utils._pytree import tree_map
from torchvision import datapoints
from torchvision.transforms.functional import _get_perspective_coeffs
......@@ -27,6 +28,8 @@ from torchvision.transforms.v2 import functional as F
from torchvision.transforms.v2.functional._geometry import _center_crop_compute_padding
from torchvision.transforms.v2.functional._meta import clamp_bounding_box, convert_format_bounding_box
from torchvision.transforms.v2.utils import is_simple_tensor
from transforms_v2_dispatcher_infos import DISPATCHER_INFOS
from transforms_v2_kernel_infos import KERNEL_INFOS
KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS}
......@@ -635,7 +638,7 @@ class TestConvertFormatBoundingBox:
# TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in
# `prototype_transforms_kernel_infos.py`
# `transforms_v2_kernel_infos.py`
def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_):
......
......@@ -4,7 +4,7 @@ import pytest
import torch
import torchvision.transforms.v2.utils
from prototype_common_utils import make_bounding_box, make_detection_mask, make_image
from common_utils import make_bounding_box, make_detection_mask, make_image
from torchvision import datapoints
from torchvision.transforms.v2.functional import to_image_pil
......
......@@ -2,9 +2,9 @@ import collections.abc
import pytest
import torchvision.transforms.v2.functional as F
from prototype_common_utils import InfoBase, TestMark
from prototype_transforms_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition
from common_utils import InfoBase, TestMark
from torchvision import datapoints
from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition
__all__ = ["DispatcherInfo", "DISPATCHER_INFOS"]
......@@ -49,7 +49,7 @@ class DispatcherInfo(InfoBase):
if not kernel_info:
raise pytest.UsageError(
f"Can't register {kernel.__name__} for type {datapoint_type} since there is no `KernelInfo` for it. "
f"Please add a `KernelInfo` for it in `prototype_transforms_kernel_infos.py`."
f"Please add a `KernelInfo` for it in `transforms_v2_kernel_infos.py`."
)
kernel_infos[datapoint_type] = kernel_info
self.kernel_infos = kernel_infos
......
......@@ -9,9 +9,9 @@ import pytest
import torch.testing
import torchvision.ops
import torchvision.transforms.v2.functional as F
from datasets_utils import combinations_grid
from prototype_common_utils import (
from common_utils import (
ArgsKwargs,
combinations_grid,
get_num_channels,
ImageLoader,
InfoBase,
......