Unverified commit 4d4711d9, authored by Vasilis Vryniotis and committed by GitHub

[prototype] Switch to `spatial_size` (#6736)

* Change `image_size` to `spatial_size`

* Fix linter

* Fix more tests

* Add `get_num_channels_video` and `get_spatial_size_*` kernels for videos, masks and bounding boxes

* Refactor get_spatial_size

* Reduce the usage of `query_chw` where possible

* Rename `query_chw` to `query_spatial_size`

* Add `get_num_frames` dispatcher and kernel

* Add JIT-scriptability tests
parent 3099e0cc
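
For orientation, here is a minimal sketch of the renamed API, assembled from the calls that appear in the diff below; `features.BoundingBox` and `query_spatial_size` are prototype APIs on this branch and may still change:

import torch
from torchvision.prototype import features

# Bounding boxes now carry the size of the underlying image as `spatial_size`
# (height, width); before this PR the keyword and attribute were `image_size`.
bbox = features.BoundingBox(
    [0, 0, 5, 5],
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=(10, 10),
)
assert bbox.spatial_size == (10, 10)

# Transforms that only need the spatial size no longer go through `query_chw`;
# they use the renamed helper instead, e.g.:
#   from torchvision.prototype.transforms._utils import query_spatial_size
#   height, width = query_spatial_size(sample)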
@@ -184,13 +184,18 @@ class ArgsKwargs:
         return args, kwargs

-DEFAULT_SQUARE_IMAGE_SIZE = 15
-DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33)
-DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9)
-DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random")
+DEFAULT_SQUARE_SPATIAL_SIZE = 15
+DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33)
+DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9)
+DEFAULT_SPATIAL_SIZES = (
+    DEFAULT_LANDSCAPE_SPATIAL_SIZE,
+    DEFAULT_PORTRAIT_SPATIAL_SIZE,
+    DEFAULT_SQUARE_SPATIAL_SIZE,
+    "random",
+)

-def _parse_image_size(size, *, name="size"):
+def _parse_spatial_size(size, *, name="size"):
     if size == "random":
         return tuple(torch.randint(15, 33, (2,)).tolist())
     elif isinstance(size, int) and size > 0:
@@ -246,11 +251,11 @@ class TensorLoader:
 @dataclasses.dataclass
 class ImageLoader(TensorLoader):
     color_space: features.ColorSpace
-    image_size: Tuple[int, int] = dataclasses.field(init=False)
+    spatial_size: Tuple[int, int] = dataclasses.field(init=False)
     num_channels: int = dataclasses.field(init=False)

     def __post_init__(self):
-        self.image_size = self.shape[-2:]
+        self.spatial_size = self.shape[-2:]
         self.num_channels = self.shape[-3]
@@ -277,7 +282,7 @@ def make_image_loader(
     dtype=torch.float32,
     constant_alpha=True,
 ):
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_channels = get_num_channels(color_space)

     def fn(shape, dtype, device):
@@ -295,7 +300,7 @@ make_image = from_loader(make_image_loader)
 def make_image_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     color_spaces=(
         features.ColorSpace.GRAY,
         features.ColorSpace.GRAY_ALPHA,
@@ -316,7 +321,7 @@ make_images = from_loaders(make_image_loaders)
 @dataclasses.dataclass
 class BoundingBoxLoader(TensorLoader):
     format: features.BoundingBoxFormat
-    image_size: Tuple[int, int]
+    spatial_size: Tuple[int, int]

 def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
@@ -331,7 +336,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
     ).reshape(low.shape)

-def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32):
+def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32):
     if isinstance(format, str):
         format = features.BoundingBoxFormat[format]
     if format not in {
@@ -341,7 +346,7 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp
     }:
         raise pytest.UsageError(f"Can't make bounding box in format {format}")
-    image_size = _parse_image_size(image_size, name="image_size")
+    spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")

     def fn(shape, dtype, device):
         *extra_dims, num_coordinates = shape
@@ -350,10 +355,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp
         if any(dim == 0 for dim in extra_dims):
             return features.BoundingBox(
-                torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, image_size=image_size
+                torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
             )

-        height, width = image_size
+        height, width = spatial_size
         if format == features.BoundingBoxFormat.XYXY:
             x1 = torch.randint(0, width // 2, extra_dims)
@@ -375,10 +380,10 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp
             parts = (cx, cy, w, h)
         return features.BoundingBox(
-            torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, image_size=image_size
+            torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
         )

-    return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size)
+    return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size)

 make_bounding_box = from_loader(make_bounding_box_loader)
@@ -388,11 +393,11 @@ def make_bounding_box_loaders(
     *,
     extra_dims=DEFAULT_EXTRA_DIMS,
     formats=tuple(features.BoundingBoxFormat),
-    image_size="random",
+    spatial_size="random",
     dtypes=(torch.float32, torch.int64),
 ):
     for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes):
-        yield make_bounding_box_loader(**params, image_size=image_size)
+        yield make_bounding_box_loader(**params, spatial_size=spatial_size)

 make_bounding_boxes = from_loaders(make_bounding_box_loaders)
@@ -475,7 +480,7 @@ class MaskLoader(TensorLoader):
 def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8):
     # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects

     def fn(shape, dtype, device):
@@ -489,7 +494,7 @@ make_detection_mask = from_loader(make_detection_mask_loader)
 def make_detection_mask_loaders(
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_objects=(1, 0, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
     dtypes=(torch.uint8,),
@@ -503,7 +508,7 @@ make_detection_masks = from_loaders(make_detection_mask_loaders)
 def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
     # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories

     def fn(shape, dtype, device):
@@ -518,7 +523,7 @@ make_segmentation_mask = from_loader(make_segmentation_mask_loader)
 def make_segmentation_mask_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_categories=(1, 2, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
     dtypes=(torch.uint8,),
@@ -532,7 +537,7 @@ make_segmentation_masks = from_loaders(make_segmentation_mask_loaders)
 def make_mask_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_objects=(1, 0, "random"),
     num_categories=(1, 2, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
@@ -559,7 +564,7 @@ def make_video_loader(
     extra_dims=(),
     dtype=torch.uint8,
 ):
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames

     def fn(shape, dtype, device):
@@ -576,7 +581,7 @@ make_video = from_loader(make_video_loader)
 def make_video_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     color_spaces=(
         features.ColorSpace.GRAY,
         features.ColorSpace.RGB,
...@@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box(): ...@@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box():
formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32]
): ):
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
) )
...@@ -185,9 +185,9 @@ KERNEL_INFOS.extend( ...@@ -185,9 +185,9 @@ KERNEL_INFOS.extend(
) )
def _get_resize_sizes(image_size): def _get_resize_sizes(spatial_size):
height, width = image_size height, width = spatial_size
length = max(image_size) length = max(spatial_size)
yield length yield length
yield [length] yield [length]
yield (length,) yield (length,)
...@@ -201,7 +201,7 @@ def sample_inputs_resize_image_tensor(): ...@@ -201,7 +201,7 @@ def sample_inputs_resize_image_tensor():
for image_loader in make_image_loaders( for image_loader in make_image_loaders(
sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]
): ):
for size in _get_resize_sizes(image_loader.image_size): for size in _get_resize_sizes(image_loader.spatial_size):
yield ArgsKwargs(image_loader, size=size) yield ArgsKwargs(image_loader, size=size)
for image_loader, interpolation in itertools.product( for image_loader, interpolation in itertools.product(
...@@ -212,7 +212,7 @@ def sample_inputs_resize_image_tensor(): ...@@ -212,7 +212,7 @@ def sample_inputs_resize_image_tensor():
F.InterpolationMode.BICUBIC, F.InterpolationMode.BICUBIC,
], ],
): ):
yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) yield ArgsKwargs(image_loader, size=[min(image_loader.spatial_size) + 1], interpolation=interpolation)
yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25)
...@@ -236,7 +236,7 @@ def reference_inputs_resize_image_tensor(): ...@@ -236,7 +236,7 @@ def reference_inputs_resize_image_tensor():
F.InterpolationMode.BICUBIC, F.InterpolationMode.BICUBIC,
], ],
): ):
for size in _get_resize_sizes(image_loader.image_size): for size in _get_resize_sizes(image_loader.spatial_size):
yield ArgsKwargs( yield ArgsKwargs(
image_loader, image_loader,
size=size, size=size,
...@@ -251,8 +251,8 @@ def reference_inputs_resize_image_tensor(): ...@@ -251,8 +251,8 @@ def reference_inputs_resize_image_tensor():
def sample_inputs_resize_bounding_box(): def sample_inputs_resize_bounding_box():
for bounding_box_loader in make_bounding_box_loaders(): for bounding_box_loader in make_bounding_box_loaders():
for size in _get_resize_sizes(bounding_box_loader.image_size): for size in _get_resize_sizes(bounding_box_loader.spatial_size):
yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)
def sample_inputs_resize_mask(): def sample_inputs_resize_mask():
...@@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box(): ...@@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
image_size=bounding_box_loader.image_size, spatial_size=bounding_box_loader.spatial_size,
**affine_params, **affine_params,
) )
...@@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center): ...@@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center):
return true_matrix return true_matrix
def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None): def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None):
if center is None: if center is None:
center = [s * 0.5 for s in image_size[::-1]] center = [s * 0.5 for s in spatial_size[::-1]]
def transform(bbox): def transform(bbox):
affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center) affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center)
...@@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box(): ...@@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
image_size=bounding_box_loader.image_size, spatial_size=bounding_box_loader.spatial_size,
**affine_kwargs, **affine_kwargs,
) )
...@@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box(): ...@@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box():
formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32]
): ):
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
) )
...@@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box(): ...@@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
image_size=bounding_box_loader.image_size, spatial_size=bounding_box_loader.spatial_size,
angle=_ROTATE_ANGLES[0], angle=_ROTATE_ANGLES[0],
) )
...@@ -1001,7 +1001,7 @@ def sample_inputs_pad_bounding_box(): ...@@ -1001,7 +1001,7 @@ def sample_inputs_pad_bounding_box():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
image_size=bounding_box_loader.image_size, spatial_size=bounding_box_loader.spatial_size,
padding=padding, padding=padding,
padding_mode="constant", padding_mode="constant",
) )
...@@ -1131,13 +1131,13 @@ KERNEL_INFOS.extend( ...@@ -1131,13 +1131,13 @@ KERNEL_INFOS.extend(
) )
def _get_elastic_displacement(image_size): def _get_elastic_displacement(spatial_size):
return torch.rand(1, *image_size, 2) return torch.rand(1, *spatial_size, 2)
def sample_inputs_elastic_image_tensor(): def sample_inputs_elastic_image_tensor():
for image_loader in make_image_loaders(sizes=["random"]): for image_loader in make_image_loaders(sizes=["random"]):
displacement = _get_elastic_displacement(image_loader.image_size) displacement = _get_elastic_displacement(image_loader.spatial_size)
for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]:
yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) yield ArgsKwargs(image_loader, displacement=displacement, fill=fill)
...@@ -1151,14 +1151,14 @@ def reference_inputs_elastic_image_tensor(): ...@@ -1151,14 +1151,14 @@ def reference_inputs_elastic_image_tensor():
F.InterpolationMode.BICUBIC, F.InterpolationMode.BICUBIC,
], ],
): ):
displacement = _get_elastic_displacement(image_loader.image_size) displacement = _get_elastic_displacement(image_loader.spatial_size)
for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]:
yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill)
def sample_inputs_elastic_bounding_box(): def sample_inputs_elastic_bounding_box():
for bounding_box_loader in make_bounding_box_loaders(): for bounding_box_loader in make_bounding_box_loaders():
displacement = _get_elastic_displacement(bounding_box_loader.image_size) displacement = _get_elastic_displacement(bounding_box_loader.spatial_size)
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
...@@ -1212,7 +1212,7 @@ KERNEL_INFOS.extend( ...@@ -1212,7 +1212,7 @@ KERNEL_INFOS.extend(
) )
_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)] _CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)]
_CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)]
...@@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor(): ...@@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor():
def reference_inputs_center_crop_image_tensor(): def reference_inputs_center_crop_image_tensor():
for image_loader, output_size in itertools.product( for image_loader, output_size in itertools.product(
make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES
): ):
yield ArgsKwargs(image_loader, output_size=output_size) yield ArgsKwargs(image_loader, output_size=output_size)
...@@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box(): ...@@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, bounding_box_loader,
format=bounding_box_loader.format, format=bounding_box_loader.format,
image_size=bounding_box_loader.image_size, spatial_size=bounding_box_loader.spatial_size,
output_size=output_size, output_size=output_size,
) )
...@@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask(): ...@@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask():
def reference_inputs_center_crop_mask(): def reference_inputs_center_crop_mask():
for mask_loader, output_size in itertools.product( for mask_loader, output_size in itertools.product(
make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES
): ):
yield ArgsKwargs(mask_loader, output_size=output_size) yield ArgsKwargs(mask_loader, output_size=output_size)
...@@ -1820,7 +1820,7 @@ KERNEL_INFOS.extend( ...@@ -1820,7 +1820,7 @@ KERNEL_INFOS.extend(
def sample_inputs_clamp_bounding_box(): def sample_inputs_clamp_bounding_box():
for bounding_box_loader in make_bounding_box_loaders(): for bounding_box_loader in make_bounding_box_loaders():
yield ArgsKwargs( yield ArgsKwargs(
bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
) )
...@@ -1834,7 +1834,7 @@ KERNEL_INFOS.append( ...@@ -1834,7 +1834,7 @@ KERNEL_INFOS.append(
_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]]
def _get_five_ten_crop_image_size(size): def _get_five_ten_crop_spatial_size(size):
if isinstance(size, int): if isinstance(size, int):
crop_height = crop_width = size crop_height = crop_width = size
elif len(size) == 1: elif len(size) == 1:
...@@ -1847,28 +1847,32 @@ def _get_five_ten_crop_image_size(size): ...@@ -1847,28 +1847,32 @@ def _get_five_ten_crop_image_size(size):
def sample_inputs_five_crop_image_tensor(): def sample_inputs_five_crop_image_tensor():
for size in _FIVE_TEN_CROP_SIZES: for size in _FIVE_TEN_CROP_SIZES:
for image_loader in make_image_loaders( for image_loader in make_image_loaders(
sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] sizes=[_get_five_ten_crop_spatial_size(size)],
color_spaces=[features.ColorSpace.RGB],
dtypes=[torch.float32],
): ):
yield ArgsKwargs(image_loader, size=size) yield ArgsKwargs(image_loader, size=size)
def reference_inputs_five_crop_image_tensor(): def reference_inputs_five_crop_image_tensor():
for size in _FIVE_TEN_CROP_SIZES: for size in _FIVE_TEN_CROP_SIZES:
for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]):
yield ArgsKwargs(image_loader, size=size) yield ArgsKwargs(image_loader, size=size)
def sample_inputs_ten_crop_image_tensor(): def sample_inputs_ten_crop_image_tensor():
for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
for image_loader in make_image_loaders( for image_loader in make_image_loaders(
sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] sizes=[_get_five_ten_crop_spatial_size(size)],
color_spaces=[features.ColorSpace.RGB],
dtypes=[torch.float32],
): ):
yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)
def reference_inputs_ten_crop_image_tensor(): def reference_inputs_ten_crop_image_tensor():
for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]):
yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)
......
...@@ -298,7 +298,7 @@ class TestRandomHorizontalFlip: ...@@ -298,7 +298,7 @@ class TestRandomHorizontalFlip:
assert_equal(features.Mask(expected), actual) assert_equal(features.Mask(expected), actual)
def test_features_bounding_box(self, p): def test_features_bounding_box(self, p):
input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomHorizontalFlip(p=p) transform = transforms.RandomHorizontalFlip(p=p)
actual = transform(input) actual = transform(input)
...@@ -307,7 +307,7 @@ class TestRandomHorizontalFlip: ...@@ -307,7 +307,7 @@ class TestRandomHorizontalFlip:
expected = features.BoundingBox.wrap_like(input, expected_image_tensor) expected = features.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual) assert_equal(expected, actual)
assert actual.format == expected.format assert actual.format == expected.format
assert actual.image_size == expected.image_size assert actual.spatial_size == expected.spatial_size
@pytest.mark.parametrize("p", [0.0, 1.0]) @pytest.mark.parametrize("p", [0.0, 1.0])
...@@ -351,7 +351,7 @@ class TestRandomVerticalFlip: ...@@ -351,7 +351,7 @@ class TestRandomVerticalFlip:
assert_equal(features.Mask(expected), actual) assert_equal(features.Mask(expected), actual)
def test_features_bounding_box(self, p): def test_features_bounding_box(self, p):
input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10))
transform = transforms.RandomVerticalFlip(p=p) transform = transforms.RandomVerticalFlip(p=p)
actual = transform(input) actual = transform(input)
...@@ -360,7 +360,7 @@ class TestRandomVerticalFlip: ...@@ -360,7 +360,7 @@ class TestRandomVerticalFlip:
expected = features.BoundingBox.wrap_like(input, expected_image_tensor) expected = features.BoundingBox.wrap_like(input, expected_image_tensor)
assert_equal(expected, actual) assert_equal(expected, actual)
assert actual.format == expected.format assert actual.format == expected.format
assert actual.image_size == expected.image_size assert actual.spatial_size == expected.spatial_size
class TestPad: class TestPad:
...@@ -435,7 +435,7 @@ class TestRandomZoomOut: ...@@ -435,7 +435,7 @@ class TestRandomZoomOut:
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range)
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
h, w = image.image_size = (24, 32) h, w = image.spatial_size = (24, 32)
params = transform._get_params(image) params = transform._get_params(image)
...@@ -450,7 +450,7 @@ class TestRandomZoomOut: ...@@ -450,7 +450,7 @@ class TestRandomZoomOut:
def test__transform(self, fill, side_range, mocker): def test__transform(self, fill, side_range, mocker):
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (24, 32) inpt.spatial_size = (24, 32)
transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1)
...@@ -559,17 +559,17 @@ class TestRandomRotation: ...@@ -559,17 +559,17 @@ class TestRandomRotation:
@pytest.mark.parametrize("angle", [34, -87]) @pytest.mark.parametrize("angle", [34, -87])
@pytest.mark.parametrize("expand", [False, True]) @pytest.mark.parametrize("expand", [False, True])
def test_boundingbox_image_size(self, angle, expand): def test_boundingbox_spatial_size(self, angle, expand):
# Specific test for BoundingBox.rotate # Specific test for BoundingBox.rotate
bbox = features.BoundingBox( bbox = features.BoundingBox(
torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, spatial_size=(32, 32)
) )
img = features.Image(torch.rand(1, 3, 32, 32)) img = features.Image(torch.rand(1, 3, 32, 32))
out_img = img.rotate(angle, expand=expand) out_img = img.rotate(angle, expand=expand)
out_bbox = bbox.rotate(angle, expand=expand) out_bbox = bbox.rotate(angle, expand=expand)
assert out_img.image_size == out_bbox.image_size assert out_img.spatial_size == out_bbox.spatial_size
class TestRandomAffine: class TestRandomAffine:
...@@ -619,8 +619,8 @@ class TestRandomAffine: ...@@ -619,8 +619,8 @@ class TestRandomAffine:
def test__get_params(self, degrees, translate, scale, shear, mocker): def test__get_params(self, degrees, translate, scale, shear, mocker):
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
h, w = image.image_size h, w = image.spatial_size
transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear)
params = transform._get_params(image) params = transform._get_params(image)
...@@ -682,7 +682,7 @@ class TestRandomAffine: ...@@ -682,7 +682,7 @@ class TestRandomAffine:
fn = mocker.patch("torchvision.prototype.transforms.functional.affine") fn = mocker.patch("torchvision.prototype.transforms.functional.affine")
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (24, 32) inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute # vfdev-5, Feature Request: let's store params as Transform attribute
# This could be also helpful for users # This could be also helpful for users
...@@ -718,8 +718,8 @@ class TestRandomCrop: ...@@ -718,8 +718,8 @@ class TestRandomCrop:
def test__get_params(self, padding, pad_if_needed, size, mocker): def test__get_params(self, padding, pad_if_needed, size, mocker):
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
h, w = image.image_size h, w = image.spatial_size
transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed)
params = transform._get_params(image) params = transform._get_params(image)
...@@ -771,19 +771,19 @@ class TestRandomCrop: ...@@ -771,19 +771,19 @@ class TestRandomCrop:
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (32, 32) inpt.spatial_size = (32, 32)
expected = mocker.MagicMock(spec=features.Image) expected = mocker.MagicMock(spec=features.Image)
expected.num_channels = 3 expected.num_channels = 3
if isinstance(padding, int): if isinstance(padding, int):
expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding)
elif isinstance(padding, list): elif isinstance(padding, list):
expected.image_size = ( expected.spatial_size = (
inpt.image_size[0] + sum(padding[0::2]), inpt.spatial_size[0] + sum(padding[0::2]),
inpt.image_size[1] + sum(padding[1::2]), inpt.spatial_size[1] + sum(padding[1::2]),
) )
else: else:
expected.image_size = inpt.image_size expected.spatial_size = inpt.spatial_size
_ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected)
fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop")
...@@ -859,7 +859,7 @@ class TestGaussianBlur: ...@@ -859,7 +859,7 @@ class TestGaussianBlur:
fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur")
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (24, 32) inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute # vfdev-5, Feature Request: let's store params as Transform attribute
# This could be also helpful for users # This could be also helpful for users
...@@ -910,11 +910,11 @@ class TestRandomPerspective: ...@@ -910,11 +910,11 @@ class TestRandomPerspective:
transform = transforms.RandomPerspective(dscale) transform = transforms.RandomPerspective(dscale)
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
params = transform._get_params(image) params = transform._get_params(image)
h, w = image.image_size h, w = image.spatial_size
assert "perspective_coeffs" in params assert "perspective_coeffs" in params
assert len(params["perspective_coeffs"]) == 8 assert len(params["perspective_coeffs"]) == 8
...@@ -927,7 +927,7 @@ class TestRandomPerspective: ...@@ -927,7 +927,7 @@ class TestRandomPerspective:
fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") fn = mocker.patch("torchvision.prototype.transforms.functional.perspective")
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (24, 32) inpt.spatial_size = (24, 32)
# vfdev-5, Feature Request: let's store params as Transform attribute # vfdev-5, Feature Request: let's store params as Transform attribute
# This could be also helpful for users # This could be also helpful for users
# Otherwise, we can mock transform._get_params # Otherwise, we can mock transform._get_params
...@@ -971,11 +971,11 @@ class TestElasticTransform: ...@@ -971,11 +971,11 @@ class TestElasticTransform:
transform = transforms.ElasticTransform(alpha, sigma) transform = transforms.ElasticTransform(alpha, sigma)
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
params = transform._get_params(image) params = transform._get_params(image)
h, w = image.image_size h, w = image.spatial_size
displacement = params["displacement"] displacement = params["displacement"]
assert displacement.shape == (1, h, w, 2) assert displacement.shape == (1, h, w, 2)
assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all()
...@@ -1001,7 +1001,7 @@ class TestElasticTransform: ...@@ -1001,7 +1001,7 @@ class TestElasticTransform:
fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") fn = mocker.patch("torchvision.prototype.transforms.functional.elastic")
inpt = mocker.MagicMock(spec=features.Image) inpt = mocker.MagicMock(spec=features.Image)
inpt.num_channels = 3 inpt.num_channels = 3
inpt.image_size = (24, 32) inpt.spatial_size = (24, 32)
# Let's mock transform._get_params to control the output: # Let's mock transform._get_params to control the output:
transform._get_params = mocker.MagicMock() transform._get_params = mocker.MagicMock()
...@@ -1030,7 +1030,7 @@ class TestRandomErasing: ...@@ -1030,7 +1030,7 @@ class TestRandomErasing:
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=[1, 2, 3, 4]) transform = transforms.RandomErasing(value=[1, 2, 3, 4])
...@@ -1041,7 +1041,7 @@ class TestRandomErasing: ...@@ -1041,7 +1041,7 @@ class TestRandomErasing:
def test__get_params(self, value, mocker): def test__get_params(self, value, mocker):
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
transform = transforms.RandomErasing(value=value) transform = transforms.RandomErasing(value=value)
params = transform._get_params(image) params = transform._get_params(image)
...@@ -1057,8 +1057,8 @@ class TestRandomErasing: ...@@ -1057,8 +1057,8 @@ class TestRandomErasing:
elif isinstance(value, (list, tuple)): elif isinstance(value, (list, tuple)):
assert v.shape == (image.num_channels, 1, 1) assert v.shape == (image.num_channels, 1, 1)
assert 0 <= i <= image.image_size[0] - h assert 0 <= i <= image.spatial_size[0] - h
assert 0 <= j <= image.image_size[1] - w assert 0 <= j <= image.spatial_size[1] - w
@pytest.mark.parametrize("p", [0, 1]) @pytest.mark.parametrize("p", [0, 1])
def test__transform(self, mocker, p): def test__transform(self, mocker, p):
...@@ -1222,11 +1222,11 @@ class TestRandomIoUCrop: ...@@ -1222,11 +1222,11 @@ class TestRandomIoUCrop:
def test__get_params(self, device, options, mocker): def test__get_params(self, device, options, mocker):
image = mocker.MagicMock(spec=features.Image) image = mocker.MagicMock(spec=features.Image)
image.num_channels = 3 image.num_channels = 3
image.image_size = (24, 32) image.spatial_size = (24, 32)
bboxes = features.BoundingBox( bboxes = features.BoundingBox(
torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]),
format="XYXY", format="XYXY",
image_size=image.image_size, spatial_size=image.spatial_size,
device=device, device=device,
) )
sample = [image, bboxes] sample = [image, bboxes]
...@@ -1245,8 +1245,8 @@ class TestRandomIoUCrop: ...@@ -1245,8 +1245,8 @@ class TestRandomIoUCrop:
assert len(params["is_within_crop_area"]) > 0 assert len(params["is_within_crop_area"]) > 0
assert params["is_within_crop_area"].dtype == torch.bool assert params["is_within_crop_area"].dtype == torch.bool
orig_h = image.image_size[0] orig_h = image.spatial_size[0]
orig_w = image.image_size[1] orig_w = image.spatial_size[1]
assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h)
assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w)
...@@ -1261,7 +1261,7 @@ class TestRandomIoUCrop: ...@@ -1261,7 +1261,7 @@ class TestRandomIoUCrop:
def test__transform_empty_params(self, mocker): def test__transform_empty_params(self, mocker):
transform = transforms.RandomIoUCrop(sampler_options=[2.0]) transform = transforms.RandomIoUCrop(sampler_options=[2.0])
image = features.Image(torch.rand(1, 3, 4, 4)) image = features.Image(torch.rand(1, 3, 4, 4))
bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", image_size=(4, 4)) bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4))
label = features.Label(torch.tensor([1])) label = features.Label(torch.tensor([1]))
sample = [image, bboxes, label] sample = [image, bboxes, label]
# Let's mock transform._get_params to control the output: # Let's mock transform._get_params to control the output:
...@@ -1281,7 +1281,7 @@ class TestRandomIoUCrop: ...@@ -1281,7 +1281,7 @@ class TestRandomIoUCrop:
transform = transforms.RandomIoUCrop() transform = transforms.RandomIoUCrop()
image = features.Image(torch.rand(3, 32, 24)) image = features.Image(torch.rand(3, 32, 24))
bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,))
label = features.Label(torch.randint(0, 10, size=(6,))) label = features.Label(torch.randint(0, 10, size=(6,)))
ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1))
masks = make_detection_mask((32, 24), num_objects=6) masks = make_detection_mask((32, 24), num_objects=6)
...@@ -1329,12 +1329,12 @@ class TestRandomIoUCrop: ...@@ -1329,12 +1329,12 @@ class TestRandomIoUCrop:
class TestScaleJitter: class TestScaleJitter:
def test__get_params(self, mocker): def test__get_params(self, mocker):
image_size = (24, 32) spatial_size = (24, 32)
target_size = (16, 12) target_size = (16, 12)
scale_range = (0.5, 1.5) scale_range = (0.5, 1.5)
transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range)
sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size)
n_samples = 5 n_samples = 5
for _ in range(n_samples): for _ in range(n_samples):
...@@ -1347,11 +1347,11 @@ class TestScaleJitter: ...@@ -1347,11 +1347,11 @@ class TestScaleJitter:
assert isinstance(size, tuple) and len(size) == 2 assert isinstance(size, tuple) and len(size) == 2
height, width = size height, width = size
r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0] r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0]
r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1] r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1]
assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max) assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max)
assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max) assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max)
def test__transform(self, mocker): def test__transform(self, mocker):
interpolation_sentinel = mocker.MagicMock() interpolation_sentinel = mocker.MagicMock()
...@@ -1379,13 +1379,13 @@ class TestScaleJitter: ...@@ -1379,13 +1379,13 @@ class TestScaleJitter:
class TestRandomShortestSize: class TestRandomShortestSize:
def test__get_params(self, mocker): def test__get_params(self, mocker):
image_size = (3, 10) spatial_size = (3, 10)
min_size = [5, 9] min_size = [5, 9]
max_size = 20 max_size = 20
transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)
sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size)
params = transform._get_params(sample) params = transform._get_params(sample)
assert "size" in params assert "size" in params
...@@ -1504,7 +1504,7 @@ class TestSimpleCopyPaste: ...@@ -1504,7 +1504,7 @@ class TestSimpleCopyPaste:
labels = torch.nn.functional.one_hot(labels, num_classes=5) labels = torch.nn.functional.one_hot(labels, num_classes=5)
target = { target = {
"boxes": features.BoundingBox( "boxes": features.BoundingBox(
torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32)
), ),
"masks": features.Mask(masks), "masks": features.Mask(masks),
"labels": label_type(labels), "labels": label_type(labels),
...@@ -1519,7 +1519,7 @@ class TestSimpleCopyPaste: ...@@ -1519,7 +1519,7 @@ class TestSimpleCopyPaste:
paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5)
paste_target = { paste_target = {
"boxes": features.BoundingBox( "boxes": features.BoundingBox(
torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32)
), ),
"masks": features.Mask(paste_masks), "masks": features.Mask(paste_masks),
"labels": label_type(paste_labels), "labels": label_type(paste_labels),
...@@ -1550,14 +1550,14 @@ class TestFixedSizeCrop: ...@@ -1550,14 +1550,14 @@ class TestFixedSizeCrop:
def test__get_params(self, mocker): def test__get_params(self, mocker):
crop_size = (7, 7) crop_size = (7, 7)
batch_shape = (10,) batch_shape = (10,)
image_size = (11, 5) spatial_size = (11, 5)
transform = transforms.FixedSizeCrop(size=crop_size) transform = transforms.FixedSizeCrop(size=crop_size)
sample = dict( sample = dict(
image=make_image(size=image_size, color_space=features.ColorSpace.RGB), image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB),
bounding_boxes=make_bounding_box( bounding_boxes=make_bounding_box(
format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape
), ),
) )
params = transform._get_params(sample) params = transform._get_params(sample)
...@@ -1638,7 +1638,7 @@ class TestFixedSizeCrop: ...@@ -1638,7 +1638,7 @@ class TestFixedSizeCrop:
def test__transform_culling(self, mocker): def test__transform_culling(self, mocker):
batch_size = 10 batch_size = 10
image_size = (10, 10) spatial_size = (10, 10)
is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool)
mocker.patch( mocker.patch(
...@@ -1647,17 +1647,17 @@ class TestFixedSizeCrop: ...@@ -1647,17 +1647,17 @@ class TestFixedSizeCrop:
needs_crop=True, needs_crop=True,
top=0, top=0,
left=0, left=0,
height=image_size[0], height=spatial_size[0],
width=image_size[1], width=spatial_size[1],
is_valid=is_valid, is_valid=is_valid,
needs_pad=False, needs_pad=False,
), ),
) )
bounding_boxes = make_bounding_box( bounding_boxes = make_bounding_box(
format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
) )
masks = make_detection_mask(size=image_size, extra_dims=(batch_size,)) masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,))
labels = make_label(extra_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,))
transform = transforms.FixedSizeCrop((-1, -1)) transform = transforms.FixedSizeCrop((-1, -1))
...@@ -1678,7 +1678,7 @@ class TestFixedSizeCrop: ...@@ -1678,7 +1678,7 @@ class TestFixedSizeCrop:
def test__transform_bounding_box_clamping(self, mocker): def test__transform_bounding_box_clamping(self, mocker):
batch_size = 3 batch_size = 3
image_size = (10, 10) spatial_size = (10, 10)
mocker.patch( mocker.patch(
"torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params",
...@@ -1686,15 +1686,15 @@ class TestFixedSizeCrop: ...@@ -1686,15 +1686,15 @@ class TestFixedSizeCrop:
needs_crop=True, needs_crop=True,
top=0, top=0,
left=0, left=0,
height=image_size[0], height=spatial_size[0],
width=image_size[1], width=spatial_size[1],
is_valid=torch.full((batch_size,), fill_value=True), is_valid=torch.full((batch_size,), fill_value=True),
needs_pad=False, needs_pad=False,
), ),
) )
bounding_box = make_bounding_box( bounding_box = make_bounding_box(
format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
) )
mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box")
......
...@@ -24,7 +24,7 @@ from torchvision import transforms as legacy_transforms ...@@ -24,7 +24,7 @@ from torchvision import transforms as legacy_transforms
from torchvision._utils import sequence_to_str from torchvision._utils import sequence_to_str
from torchvision.prototype import features, transforms as prototype_transforms from torchvision.prototype import features, transforms as prototype_transforms
from torchvision.prototype.transforms import functional as F from torchvision.prototype.transforms import functional as F
from torchvision.prototype.transforms._utils import query_chw from torchvision.prototype.transforms._utils import query_spatial_size
from torchvision.prototype.transforms.functional import to_image_pil from torchvision.prototype.transforms.functional import to_image_pil
DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)])
...@@ -871,7 +871,7 @@ class TestRefDetTransforms: ...@@ -871,7 +871,7 @@ class TestRefDetTransforms:
pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB))
target = { target = {
"boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80), "labels": make_label(extra_dims=(num_objects,), categories=80),
} }
if with_mask: if with_mask:
...@@ -881,7 +881,7 @@ class TestRefDetTransforms: ...@@ -881,7 +881,7 @@ class TestRefDetTransforms:
tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB)) tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB))
target = { target = {
"boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80), "labels": make_label(extra_dims=(num_objects,), categories=80),
} }
if with_mask: if with_mask:
...@@ -891,7 +891,7 @@ class TestRefDetTransforms: ...@@ -891,7 +891,7 @@ class TestRefDetTransforms:
feature_image = make_image(size=size, color_space=features.ColorSpace.RGB) feature_image = make_image(size=size, color_space=features.ColorSpace.RGB)
target = { target = {
"boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80), "labels": make_label(extra_dims=(num_objects,), categories=80),
} }
if with_mask: if with_mask:
...@@ -949,7 +949,7 @@ class PadIfSmaller(prototype_transforms.Transform): ...@@ -949,7 +949,7 @@ class PadIfSmaller(prototype_transforms.Transform):
self.fill = prototype_transforms._geometry._setup_fill_arg(fill) self.fill = prototype_transforms._geometry._setup_fill_arg(fill)
def _get_params(self, sample): def _get_params(self, sample):
_, height, width = query_chw(sample) height, width = query_spatial_size(sample)
padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)]
needs_padding = any(padding) needs_padding = any(padding)
return dict(padding=padding, needs_padding=needs_padding) return dict(padding=padding, needs_padding=needs_padding)
......
...@@ -224,11 +224,14 @@ class TestDispatchers: ...@@ -224,11 +224,14 @@ class TestDispatchers:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"dispatcher", "dispatcher",
[ [
F.clamp_bounding_box,
F.convert_color_space, F.convert_color_space,
F.convert_image_dtype, F.convert_image_dtype,
F.get_dimensions, F.get_dimensions,
F.get_image_num_channels, F.get_image_num_channels,
F.get_image_size, F.get_image_size,
F.get_num_channels,
F.get_num_frames,
F.get_spatial_size, F.get_spatial_size,
F.rgb_to_grayscale, F.rgb_to_grayscale,
], ],
...@@ -333,16 +336,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): ...@@ -333,16 +336,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_):
@pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("device", cpu_and_gpu())
def test_correctness_affine_bounding_box_on_fixed_input(device): def test_correctness_affine_bounding_box_on_fixed_input(device):
# Check transformation against known expected output # Check transformation against known expected output
image_size = (64, 64) spatial_size = (64, 64)
# xyxy format # xyxy format
in_boxes = [ in_boxes = [
[20, 25, 35, 45], [20, 25, 35, 45],
[50, 5, 70, 22], [50, 5, 70, 22],
[image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10],
[1, 1, 5, 5], [1, 1, 5, 5],
] ]
in_boxes = features.BoundingBox( in_boxes = features.BoundingBox(
in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device
) )
# Tested parameters # Tested parameters
angle = 63 angle = 63
...@@ -355,9 +358,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): ...@@ -355,9 +358,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
# from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox
# expected_bboxes = [] # expected_bboxes = []
# for in_box in in_boxes: # for in_box in in_boxes:
# n_in_box = normalize_bbox(in_box, *image_size) # n_in_box = normalize_bbox(in_box, *spatial_size)
# n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *spatial_size)
# out_box = denormalize_bbox(n_out_box, *image_size) # out_box = denormalize_bbox(n_out_box, *spatial_size)
# expected_bboxes.append(out_box) # expected_bboxes.append(out_box)
expected_bboxes = [ expected_bboxes = [
(24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695),
...@@ -369,9 +372,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): ...@@ -369,9 +372,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
output_boxes = F.affine_bounding_box( output_boxes = F.affine_bounding_box(
in_boxes, in_boxes,
in_boxes.format, in_boxes.format,
in_boxes.image_size, in_boxes.spatial_size,
angle, angle,
(dx * image_size[1], dy * image_size[0]), (dx * spatial_size[1], dy * spatial_size[0]),
scale, scale,
shear=(0, 0), shear=(0, 0),
) )
...@@ -406,7 +409,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): ...@@ -406,7 +409,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center):
affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_)
affine_matrix = affine_matrix[:2, :] affine_matrix = affine_matrix[:2, :]
height, width = bbox.image_size height, width = bbox.spatial_size
bbox_xyxy = convert_format_bounding_box( bbox_xyxy = convert_format_bounding_box(
bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY
) )
...@@ -444,7 +447,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center): ...@@ -444,7 +447,7 @@ def test_correctness_rotate_bounding_box(angle, expand, center):
out_bbox = features.BoundingBox( out_bbox = features.BoundingBox(
out_bbox, out_bbox,
format=features.BoundingBoxFormat.XYXY, format=features.BoundingBoxFormat.XYXY,
image_size=(height, width), spatial_size=(height, width),
dtype=bbox.dtype, dtype=bbox.dtype,
device=bbox.device, device=bbox.device,
) )
...@@ -455,16 +458,16 @@ def test_correctness_rotate_bounding_box(angle, expand, center): ...@@ -455,16 +458,16 @@ def test_correctness_rotate_bounding_box(angle, expand, center):
(height, width), (height, width),
) )
image_size = (32, 38) spatial_size = (32, 38)
for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)):
bboxes_format = bboxes.format bboxes_format = bboxes.format
bboxes_image_size = bboxes.image_size bboxes_spatial_size = bboxes.spatial_size
output_bboxes, output_image_size = F.rotate_bounding_box( output_bboxes, output_spatial_size = F.rotate_bounding_box(
bboxes, bboxes,
bboxes_format, bboxes_format,
image_size=bboxes_image_size, spatial_size=bboxes_spatial_size,
angle=angle, angle=angle,
expand=expand, expand=expand,
center=center, center=center,
...@@ -472,38 +475,38 @@ def test_correctness_rotate_bounding_box(angle, expand, center): ...@@ -472,38 +475,38 @@ def test_correctness_rotate_bounding_box(angle, expand, center):
center_ = center center_ = center
if center_ is None: if center_ is None:
center_ = [s * 0.5 for s in bboxes_image_size[::-1]] center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]]
if bboxes.ndim < 2: if bboxes.ndim < 2:
bboxes = [bboxes] bboxes = [bboxes]
expected_bboxes = [] expected_bboxes = []
for bbox in bboxes: for bbox in bboxes:
bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_) expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_)
expected_bboxes.append(expected_bbox) expected_bboxes.append(expected_bbox)
if len(expected_bboxes) > 1: if len(expected_bboxes) > 1:
expected_bboxes = torch.stack(expected_bboxes) expected_bboxes = torch.stack(expected_bboxes)
else: else:
expected_bboxes = expected_bboxes[0] expected_bboxes = expected_bboxes[0]
torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0)
torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0) torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0)
@pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("device", cpu_and_gpu())
@pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 @pytest.mark.parametrize("expand", [False]) # expand=True does not match D2
def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
# Check transformation against known expected output # Check transformation against known expected output
image_size = (64, 64) spatial_size = (64, 64)
# xyxy format # xyxy format
in_boxes = [ in_boxes = [
[1, 1, 5, 5], [1, 1, 5, 5],
[1, image_size[0] - 6, 5, image_size[0] - 2], [1, spatial_size[0] - 6, 5, spatial_size[0] - 2],
[image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2],
[image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10],
] ]
in_boxes = features.BoundingBox( in_boxes = features.BoundingBox(
in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device
) )
# Tested parameters # Tested parameters
angle = 45 angle = 45
...@@ -535,7 +538,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): ...@@ -535,7 +538,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
output_boxes, _ = F.rotate_bounding_box( output_boxes, _ = F.rotate_bounding_box(
in_boxes, in_boxes,
in_boxes.format, in_boxes.format,
in_boxes.image_size, in_boxes.spatial_size,
angle, angle,
expand=expand, expand=expand,
center=center, center=center,
...@@ -593,11 +596,11 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, ...@@ -593,11 +596,11 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
[50.0, 5.0, 70.0, 22.0], [50.0, 5.0, 70.0, 22.0],
[45.0, 46.0, 56.0, 62.0], [45.0, 46.0, 56.0, 62.0],
] ]
in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=size, device=device)
if format != features.BoundingBoxFormat.XYXY: if format != features.BoundingBoxFormat.XYXY:
in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)
output_boxes, output_image_size = F.crop_bounding_box( output_boxes, output_spatial_size = F.crop_bounding_box(
in_boxes, in_boxes,
format, format,
top, top,
...@@ -610,7 +613,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, ...@@ -610,7 +613,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)
torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes)
torch.testing.assert_close(output_image_size, size) torch.testing.assert_close(output_spatial_size, size)
@pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("device", cpu_and_gpu())
...@@ -658,7 +661,7 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height ...@@ -658,7 +661,7 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height
bbox[3] = (bbox[3] - top_) * size_[0] / height_ bbox[3] = (bbox[3] - top_) * size_[0] / height_
return bbox return bbox
image_size = (100, 100) spatial_size = (100, 100)
# xyxy format # xyxy format
in_boxes = [ in_boxes = [
[10.0, 10.0, 20.0, 20.0], [10.0, 10.0, 20.0, 20.0],
...@@ -670,18 +673,18 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height ...@@ -670,18 +673,18 @@ def test_correctness_resized_crop_bounding_box(device, format, top, left, height
expected_bboxes = torch.tensor(expected_bboxes, device=device) expected_bboxes = torch.tensor(expected_bboxes, device=device)
in_boxes = features.BoundingBox( in_boxes = features.BoundingBox(
in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device
) )
if format != features.BoundingBoxFormat.XYXY: if format != features.BoundingBoxFormat.XYXY:
in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)
output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size)
if format != features.BoundingBoxFormat.XYXY: if format != features.BoundingBoxFormat.XYXY:
output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)
torch.testing.assert_close(output_boxes, expected_bboxes) torch.testing.assert_close(output_boxes, expected_bboxes)
torch.testing.assert_close(output_image_size, size) torch.testing.assert_close(output_spatial_size, size)
def _parse_padding(padding): def _parse_padding(padding):
...@@ -718,28 +721,28 @@ def test_correctness_pad_bounding_box(device, padding): ...@@ -718,28 +721,28 @@ def test_correctness_pad_bounding_box(device, padding):
bbox = bbox.to(bbox_dtype) bbox = bbox.to(bbox_dtype)
return bbox return bbox
def _compute_expected_image_size(bbox, padding_): def _compute_expected_spatial_size(bbox, padding_):
pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_)
height, width = bbox.image_size height, width = bbox.spatial_size
return height + pad_up + pad_down, width + pad_left + pad_right return height + pad_up + pad_down, width + pad_left + pad_right
for bboxes in make_bounding_boxes(): for bboxes in make_bounding_boxes():
bboxes = bboxes.to(device) bboxes = bboxes.to(device)
bboxes_format = bboxes.format bboxes_format = bboxes.format
bboxes_image_size = bboxes.image_size bboxes_spatial_size = bboxes.spatial_size
output_boxes, output_image_size = F.pad_bounding_box( output_boxes, output_spatial_size = F.pad_bounding_box(
bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding
) )
torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding)) torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding))
if bboxes.ndim < 2 or bboxes.shape[0] == 0: if bboxes.ndim < 2 or bboxes.shape[0] == 0:
bboxes = [bboxes] bboxes = [bboxes]
expected_bboxes = [] expected_bboxes = []
for bbox in bboxes: for bbox in bboxes:
bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
expected_bboxes.append(_compute_expected_bbox(bbox, padding)) expected_bboxes.append(_compute_expected_bbox(bbox, padding))
if len(expected_bboxes) > 1: if len(expected_bboxes) > 1:
...@@ -807,7 +810,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): ...@@ -807,7 +810,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
out_bbox = features.BoundingBox( out_bbox = features.BoundingBox(
np.array(out_bbox), np.array(out_bbox),
format=features.BoundingBoxFormat.XYXY, format=features.BoundingBoxFormat.XYXY,
image_size=bbox.image_size, spatial_size=bbox.spatial_size,
dtype=bbox.dtype, dtype=bbox.dtype,
device=bbox.device, device=bbox.device,
) )
...@@ -815,15 +818,15 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): ...@@ -815,15 +818,15 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
) )
image_size = (32, 38) spatial_size = (32, 38)
pcoeffs = _get_perspective_coeffs(startpoints, endpoints) pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)
for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)):
bboxes = bboxes.to(device) bboxes = bboxes.to(device)
bboxes_format = bboxes.format bboxes_format = bboxes.format
bboxes_image_size = bboxes.image_size bboxes_spatial_size = bboxes.spatial_size
output_bboxes = F.perspective_bounding_box( output_bboxes = F.perspective_bounding_box(
bboxes, bboxes,
...@@ -836,7 +839,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): ...@@ -836,7 +839,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
expected_bboxes = [] expected_bboxes = []
for bbox in bboxes: for bbox in bboxes:
bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs))
if len(expected_bboxes) > 1: if len(expected_bboxes) > 1:
expected_bboxes = torch.stack(expected_bboxes) expected_bboxes = torch.stack(expected_bboxes)
...@@ -853,14 +856,14 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints): ...@@ -853,14 +856,14 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
def test_correctness_center_crop_bounding_box(device, output_size): def test_correctness_center_crop_bounding_box(device, output_size):
def _compute_expected_bbox(bbox, output_size_): def _compute_expected_bbox(bbox, output_size_):
format_ = bbox.format format_ = bbox.format
image_size_ = bbox.image_size spatial_size_ = bbox.spatial_size
bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH) bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH)
if len(output_size_) == 1: if len(output_size_) == 1:
output_size_.append(output_size_[-1]) output_size_.append(output_size_[-1])
cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5))
cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5))
out_bbox = [ out_bbox = [
bbox[0].item() - cx, bbox[0].item() - cx,
bbox[1].item() - cy, bbox[1].item() - cy,
...@@ -870,7 +873,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): ...@@ -870,7 +873,7 @@ def test_correctness_center_crop_bounding_box(device, output_size):
out_bbox = features.BoundingBox( out_bbox = features.BoundingBox(
out_bbox, out_bbox,
format=features.BoundingBoxFormat.XYWH, format=features.BoundingBoxFormat.XYWH,
image_size=output_size_, spatial_size=output_size_,
dtype=bbox.dtype, dtype=bbox.dtype,
device=bbox.device, device=bbox.device,
) )
...@@ -879,10 +882,10 @@ def test_correctness_center_crop_bounding_box(device, output_size): ...@@ -879,10 +882,10 @@ def test_correctness_center_crop_bounding_box(device, output_size):
for bboxes in make_bounding_boxes(extra_dims=((4,),)): for bboxes in make_bounding_boxes(extra_dims=((4,),)):
bboxes = bboxes.to(device) bboxes = bboxes.to(device)
bboxes_format = bboxes.format bboxes_format = bboxes.format
bboxes_image_size = bboxes.image_size bboxes_spatial_size = bboxes.spatial_size
output_boxes, output_image_size = F.center_crop_bounding_box( output_boxes, output_spatial_size = F.center_crop_bounding_box(
bboxes, bboxes_format, bboxes_image_size, output_size bboxes, bboxes_format, bboxes_spatial_size, output_size
) )
if bboxes.ndim < 2: if bboxes.ndim < 2:
...@@ -890,7 +893,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): ...@@ -890,7 +893,7 @@ def test_correctness_center_crop_bounding_box(device, output_size):
expected_bboxes = [] expected_bboxes = []
for bbox in bboxes: for bbox in bboxes:
bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) expected_bboxes.append(_compute_expected_bbox(bbox, output_size))
if len(expected_bboxes) > 1: if len(expected_bboxes) > 1:
...@@ -898,7 +901,7 @@ def test_correctness_center_crop_bounding_box(device, output_size): ...@@ -898,7 +901,7 @@ def test_correctness_center_crop_bounding_box(device, output_size):
else: else:
expected_bboxes = expected_bboxes[0] expected_bboxes = expected_bboxes[0]
torch.testing.assert_close(output_boxes, expected_bboxes) torch.testing.assert_close(output_boxes, expected_bboxes)
torch.testing.assert_close(output_image_size, output_size) torch.testing.assert_close(output_spatial_size, output_size)
@pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("device", cpu_and_gpu())
...@@ -926,11 +929,11 @@ def test_correctness_center_crop_mask(device, output_size): ...@@ -926,11 +929,11 @@ def test_correctness_center_crop_mask(device, output_size):
# Copied from test/test_functional_tensor.py # Copied from test/test_functional_tensor.py
@pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("device", cpu_and_gpu())
@pytest.mark.parametrize("image_size", ("small", "large")) @pytest.mark.parametrize("spatial_size", ("small", "large"))
@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16])
@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)])
@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)])
def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma):
fn = F.gaussian_blur_image_tensor fn = F.gaussian_blur_image_tensor
# true_cv2_results = { # true_cv2_results = {
...@@ -950,7 +953,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s ...@@ -950,7 +953,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s
p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt")
true_cv2_results = torch.load(p) true_cv2_results = torch.load(p)
if image_size == "small": if spatial_size == "small":
tensor = ( tensor = (
torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device)
) )
......
...@@ -11,8 +11,8 @@ from torchvision.prototype.transforms.functional import to_image_pil ...@@ -11,8 +11,8 @@ from torchvision.prototype.transforms.functional import to_image_pil
IMAGE = make_image(color_space=features.ColorSpace.RGB) IMAGE = make_image(color_space=features.ColorSpace.RGB)
BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size)
MASK = make_detection_mask(size=IMAGE.image_size) MASK = make_detection_mask(size=IMAGE.spatial_size)
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -110,7 +110,9 @@ class Caltech101(Dataset): ...@@ -110,7 +110,9 @@ class Caltech101(Dataset):
image=image, image=image,
ann_path=ann_path, ann_path=ann_path,
bounding_box=BoundingBox( bounding_box=BoundingBox(
ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", image_size=image.image_size ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]],
format="xyxy",
spatial_size=image.spatial_size,
), ),
contour=_Feature(ann["obj_contour"].T), contour=_Feature(ann["obj_contour"].T),
) )
......
...@@ -144,7 +144,7 @@ class CelebA(Dataset): ...@@ -144,7 +144,7 @@ class CelebA(Dataset):
bounding_box=BoundingBox( bounding_box=BoundingBox(
[int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")], [int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")],
format="xywh", format="xywh",
image_size=image.image_size, spatial_size=image.spatial_size,
), ),
landmarks={ landmarks={
landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"]))) landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"])))
......
...@@ -97,25 +97,29 @@ class Coco(Dataset): ...@@ -97,25 +97,29 @@ class Coco(Dataset):
) )
return [images, meta] return [images, meta]
def _segmentation_to_mask(self, segmentation: Any, *, is_crowd: bool, image_size: Tuple[int, int]) -> torch.Tensor: def _segmentation_to_mask(
self, segmentation: Any, *, is_crowd: bool, spatial_size: Tuple[int, int]
) -> torch.Tensor:
from pycocotools import mask from pycocotools import mask
if is_crowd: if is_crowd:
segmentation = mask.frPyObjects(segmentation, *image_size) segmentation = mask.frPyObjects(segmentation, *spatial_size)
else: else:
segmentation = mask.merge(mask.frPyObjects(segmentation, *image_size)) segmentation = mask.merge(mask.frPyObjects(segmentation, *spatial_size))
return torch.from_numpy(mask.decode(segmentation)).to(torch.bool) return torch.from_numpy(mask.decode(segmentation)).to(torch.bool)
def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]:
image_size = (image_meta["height"], image_meta["width"]) spatial_size = (image_meta["height"], image_meta["width"])
labels = [ann["category_id"] for ann in anns] labels = [ann["category_id"] for ann in anns]
return dict( return dict(
# TODO: create a segmentation feature # TODO: create a segmentation feature
segmentations=_Feature( segmentations=_Feature(
torch.stack( torch.stack(
[ [
self._segmentation_to_mask(ann["segmentation"], is_crowd=ann["iscrowd"], image_size=image_size) self._segmentation_to_mask(
ann["segmentation"], is_crowd=ann["iscrowd"], spatial_size=spatial_size
)
for ann in anns for ann in anns
] ]
) )
...@@ -125,7 +129,7 @@ class Coco(Dataset): ...@@ -125,7 +129,7 @@ class Coco(Dataset):
bounding_boxes=BoundingBox( bounding_boxes=BoundingBox(
[ann["bbox"] for ann in anns], [ann["bbox"] for ann in anns],
format="xywh", format="xywh",
image_size=image_size, spatial_size=spatial_size,
), ),
labels=Label(labels, categories=self._categories), labels=Label(labels, categories=self._categories),
super_categories=[self._category_to_super_category[self._categories[label]] for label in labels], super_categories=[self._category_to_super_category[self._categories[label]] for label in labels],
......
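Note that `spatial_size` unpacks as `(height, width)`, which matches the `(h, w)` argument order pycocotools expects in `frPyObjects`. A small standalone sketch of the non-crowd decode path shown above, assuming pycocotools is installed; the polygon coordinates are made up purely for illustration:

```python
import torch
from pycocotools import mask as mask_utils

spatial_size = (480, 640)  # (height, width)

# A COCO-style polygon segmentation: a list of flat [x1, y1, x2, y2, ...] rings.
segmentation = [[100.0, 100.0, 200.0, 100.0, 200.0, 200.0, 100.0, 200.0]]

rles = mask_utils.frPyObjects(segmentation, *spatial_size)  # one RLE per polygon ring
rle = mask_utils.merge(rles)                                 # union of all rings
mask = torch.from_numpy(mask_utils.decode(rle)).to(torch.bool)
print(mask.shape)  # torch.Size([480, 640])
```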
...@@ -130,13 +130,13 @@ class CUB200(Dataset): ...@@ -130,13 +130,13 @@ class CUB200(Dataset):
return path.with_suffix(".jpg").name return path.with_suffix(".jpg").name
def _2011_prepare_ann( def _2011_prepare_ann(
self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], image_size: Tuple[int, int] self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int]
) -> Dict[str, Any]: ) -> Dict[str, Any]:
_, (bounding_box_data, segmentation_data) = data _, (bounding_box_data, segmentation_data) = data
segmentation_path, segmentation_buffer = segmentation_data segmentation_path, segmentation_buffer = segmentation_data
return dict( return dict(
bounding_box=BoundingBox( bounding_box=BoundingBox(
[float(part) for part in bounding_box_data[1:]], format="xywh", image_size=image_size [float(part) for part in bounding_box_data[1:]], format="xywh", spatial_size=spatial_size
), ),
segmentation_path=segmentation_path, segmentation_path=segmentation_path,
segmentation=EncodedImage.from_file(segmentation_buffer), segmentation=EncodedImage.from_file(segmentation_buffer),
...@@ -149,7 +149,9 @@ class CUB200(Dataset): ...@@ -149,7 +149,9 @@ class CUB200(Dataset):
path = pathlib.Path(data[0]) path = pathlib.Path(data[0])
return path.with_suffix(".jpg").name, data return path.with_suffix(".jpg").name, data
def _2010_prepare_ann(self, data: Tuple[str, Tuple[str, BinaryIO]], image_size: Tuple[int, int]) -> Dict[str, Any]: def _2010_prepare_ann(
self, data: Tuple[str, Tuple[str, BinaryIO]], spatial_size: Tuple[int, int]
) -> Dict[str, Any]:
_, (path, buffer) = data _, (path, buffer) = data
content = read_mat(buffer) content = read_mat(buffer)
return dict( return dict(
...@@ -157,7 +159,7 @@ class CUB200(Dataset): ...@@ -157,7 +159,7 @@ class CUB200(Dataset):
bounding_box=BoundingBox( bounding_box=BoundingBox(
[int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")],
format="xyxy", format="xyxy",
image_size=image_size, spatial_size=spatial_size,
), ),
segmentation=_Feature(content["seg"]), segmentation=_Feature(content["seg"]),
) )
...@@ -175,7 +177,7 @@ class CUB200(Dataset): ...@@ -175,7 +177,7 @@ class CUB200(Dataset):
image = EncodedImage.from_file(buffer) image = EncodedImage.from_file(buffer)
return dict( return dict(
prepare_ann_fn(anns_data, image.image_size), prepare_ann_fn(anns_data, image.spatial_size),
image=image, image=image,
label=Label( label=Label(
int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1,
......
...@@ -78,7 +78,7 @@ class GTSRB(Dataset): ...@@ -78,7 +78,7 @@ class GTSRB(Dataset):
bounding_box = BoundingBox( bounding_box = BoundingBox(
[int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")],
format="xyxy", format="xyxy",
image_size=(int(csv_info["Height"]), int(csv_info["Width"])), spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])),
) )
return { return {
......
...@@ -89,7 +89,7 @@ class StanfordCars(Dataset): ...@@ -89,7 +89,7 @@ class StanfordCars(Dataset):
path=path, path=path,
image=image, image=image,
label=Label(target[4] - 1, categories=self._categories), label=Label(target[4] - 1, categories=self._categories),
bounding_box=BoundingBox(target[:4], format="xyxy", image_size=image.image_size), bounding_box=BoundingBox(target[:4], format="xyxy", spatial_size=image.spatial_size),
) )
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
......
...@@ -108,7 +108,7 @@ class VOC(Dataset): ...@@ -108,7 +108,7 @@ class VOC(Dataset):
for instance in instances for instance in instances
], ],
format="xyxy", format="xyxy",
image_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), spatial_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))),
), ),
labels=Label( labels=Label(
[self._categories.index(instance["name"]) for instance in instances], categories=self._categories [self._categories.index(instance["name"]) for instance in instances], categories=self._categories
......
...@@ -17,13 +17,13 @@ class BoundingBoxFormat(StrEnum): ...@@ -17,13 +17,13 @@ class BoundingBoxFormat(StrEnum):
class BoundingBox(_Feature): class BoundingBox(_Feature):
format: BoundingBoxFormat format: BoundingBoxFormat
image_size: Tuple[int, int] spatial_size: Tuple[int, int]
@classmethod @classmethod
def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBox:
bounding_box = tensor.as_subclass(cls) bounding_box = tensor.as_subclass(cls)
bounding_box.format = format bounding_box.format = format
bounding_box.image_size = image_size bounding_box.spatial_size = spatial_size
return bounding_box return bounding_box
def __new__( def __new__(
...@@ -31,7 +31,7 @@ class BoundingBox(_Feature): ...@@ -31,7 +31,7 @@ class BoundingBox(_Feature):
data: Any, data: Any,
*, *,
format: Union[BoundingBoxFormat, str], format: Union[BoundingBoxFormat, str],
image_size: Tuple[int, int], spatial_size: Tuple[int, int],
dtype: Optional[torch.dtype] = None, dtype: Optional[torch.dtype] = None,
device: Optional[Union[torch.device, str, int]] = None, device: Optional[Union[torch.device, str, int]] = None,
requires_grad: bool = False, requires_grad: bool = False,
...@@ -41,7 +41,7 @@ class BoundingBox(_Feature): ...@@ -41,7 +41,7 @@ class BoundingBox(_Feature):
if isinstance(format, str): if isinstance(format, str):
format = BoundingBoxFormat.from_str(format.upper()) format = BoundingBoxFormat.from_str(format.upper())
return cls._wrap(tensor, format=format, image_size=image_size) return cls._wrap(tensor, format=format, spatial_size=spatial_size)
@classmethod @classmethod
def wrap_like( def wrap_like(
...@@ -50,16 +50,16 @@ class BoundingBox(_Feature): ...@@ -50,16 +50,16 @@ class BoundingBox(_Feature):
tensor: torch.Tensor, tensor: torch.Tensor,
*, *,
format: Optional[BoundingBoxFormat] = None, format: Optional[BoundingBoxFormat] = None,
image_size: Optional[Tuple[int, int]] = None, spatial_size: Optional[Tuple[int, int]] = None,
) -> BoundingBox: ) -> BoundingBox:
return cls._wrap( return cls._wrap(
tensor, tensor,
format=format if format is not None else other.format, format=format if format is not None else other.format,
image_size=image_size if image_size is not None else other.image_size, spatial_size=spatial_size if spatial_size is not None else other.spatial_size,
) )
def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override]
return self._make_repr(format=self.format, image_size=self.image_size) return self._make_repr(format=self.format, spatial_size=self.spatial_size)
def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox:
if isinstance(format, str): if isinstance(format, str):
...@@ -70,11 +70,11 @@ class BoundingBox(_Feature): ...@@ -70,11 +70,11 @@ class BoundingBox(_Feature):
) )
def horizontal_flip(self) -> BoundingBox: def horizontal_flip(self) -> BoundingBox:
output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) output = self._F.horizontal_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size)
return BoundingBox.wrap_like(self, output) return BoundingBox.wrap_like(self, output)
def vertical_flip(self) -> BoundingBox: def vertical_flip(self) -> BoundingBox:
output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) output = self._F.vertical_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size)
return BoundingBox.wrap_like(self, output) return BoundingBox.wrap_like(self, output)
def resize( # type: ignore[override] def resize( # type: ignore[override]
...@@ -84,20 +84,22 @@ class BoundingBox(_Feature): ...@@ -84,20 +84,22 @@ class BoundingBox(_Feature):
max_size: Optional[int] = None, max_size: Optional[int] = None,
antialias: bool = False, antialias: bool = False,
) -> BoundingBox: ) -> BoundingBox:
output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) output, spatial_size = self._F.resize_bounding_box(
return BoundingBox.wrap_like(self, output, image_size=image_size) self, spatial_size=self.spatial_size, size=size, max_size=max_size
)
return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox:
output, image_size = self._F.crop_bounding_box( output, spatial_size = self._F.crop_bounding_box(
self, self.format, top=top, left=left, height=height, width=width self, self.format, top=top, left=left, height=height, width=width
) )
return BoundingBox.wrap_like(self, output, image_size=image_size) return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def center_crop(self, output_size: List[int]) -> BoundingBox: def center_crop(self, output_size: List[int]) -> BoundingBox:
output, image_size = self._F.center_crop_bounding_box( output, spatial_size = self._F.center_crop_bounding_box(
self, format=self.format, image_size=self.image_size, output_size=output_size self, format=self.format, spatial_size=self.spatial_size, output_size=output_size
) )
return BoundingBox.wrap_like(self, output, image_size=image_size) return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def resized_crop( def resized_crop(
self, self,
...@@ -109,8 +111,8 @@ class BoundingBox(_Feature): ...@@ -109,8 +111,8 @@ class BoundingBox(_Feature):
interpolation: InterpolationMode = InterpolationMode.BILINEAR, interpolation: InterpolationMode = InterpolationMode.BILINEAR,
antialias: bool = False, antialias: bool = False,
) -> BoundingBox: ) -> BoundingBox:
output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) output, spatial_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size)
return BoundingBox.wrap_like(self, output, image_size=image_size) return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def pad( def pad(
self, self,
...@@ -118,10 +120,10 @@ class BoundingBox(_Feature): ...@@ -118,10 +120,10 @@ class BoundingBox(_Feature):
fill: FillTypeJIT = None, fill: FillTypeJIT = None,
padding_mode: str = "constant", padding_mode: str = "constant",
) -> BoundingBox: ) -> BoundingBox:
output, image_size = self._F.pad_bounding_box( output, spatial_size = self._F.pad_bounding_box(
self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode self, format=self.format, spatial_size=self.spatial_size, padding=padding, padding_mode=padding_mode
) )
return BoundingBox.wrap_like(self, output, image_size=image_size) return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def rotate( def rotate(
self, self,
...@@ -131,10 +133,10 @@ class BoundingBox(_Feature): ...@@ -131,10 +133,10 @@ class BoundingBox(_Feature):
fill: FillTypeJIT = None, fill: FillTypeJIT = None,
center: Optional[List[float]] = None, center: Optional[List[float]] = None,
) -> BoundingBox: ) -> BoundingBox:
output, image_size = self._F.rotate_bounding_box( output, spatial_size = self._F.rotate_bounding_box(
self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center self, format=self.format, spatial_size=self.spatial_size, angle=angle, expand=expand, center=center
) )
return BoundingBox.wrap_like(self, output, image_size=image_size) return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
def affine( def affine(
self, self,
...@@ -149,7 +151,7 @@ class BoundingBox(_Feature): ...@@ -149,7 +151,7 @@ class BoundingBox(_Feature):
output = self._F.affine_bounding_box( output = self._F.affine_bounding_box(
self, self,
self.format, self.format,
self.image_size, self.spatial_size,
angle, angle,
translate=translate, translate=translate,
scale=scale, scale=scale,
......
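For reference, a minimal sketch of how the renamed metadata flows through the prototype `BoundingBox` API after this change, assuming the prototype module from this branch is importable; the box coordinates and canvas size below are illustrative only:

```python
import torch
from torchvision.prototype import features

# One box in XYXY format on a (height=64, width=48) canvas.
bbox = features.BoundingBox(
    [10.0, 10.0, 30.0, 40.0],
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=(64, 48),
    dtype=torch.float32,
)

# Flipping keeps the canvas, so spatial_size is unchanged.
flipped = bbox.horizontal_flip()
print(flipped.spatial_size)  # (64, 48)

# Cropping changes the canvas, so the kernel returns a new spatial_size
# and wrap_like attaches it to the output feature.
cropped = bbox.crop(top=5, left=5, height=32, width=32)
print(cropped.spatial_size)  # (32, 32)
```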
...@@ -49,12 +49,12 @@ class EncodedData(_Feature): ...@@ -49,12 +49,12 @@ class EncodedData(_Feature):
class EncodedImage(EncodedData): class EncodedImage(EncodedData):
# TODO: Use @functools.cached_property if we can depend on Python 3.8 # TODO: Use @functools.cached_property if we can depend on Python 3.8
@property @property
def image_size(self) -> Tuple[int, int]: def spatial_size(self) -> Tuple[int, int]:
if not hasattr(self, "_image_size"): if not hasattr(self, "_spatial_size"):
with PIL.Image.open(ReadOnlyTensorBuffer(self)) as image: with PIL.Image.open(ReadOnlyTensorBuffer(self)) as image:
self._image_size = image.height, image.width self._spatial_size = image.height, image.width
return self._image_size return self._spatial_size
class EncodedVideo(EncodedData): class EncodedVideo(EncodedData):
......
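The lazily cached property above avoids decoding the full image just to learn its dimensions: PIL only parses the header when the size is first requested, and the result is memoized on the instance. A self-contained sketch of the same pattern, independent of the torchvision tensor types:

```python
import io
from typing import Tuple

import PIL.Image


class LazySpatialSize:
    """Holds encoded image bytes and reads (height, width) only on demand."""

    def __init__(self, data: bytes) -> None:
        self._data = data

    @property
    def spatial_size(self) -> Tuple[int, int]:
        # Parse the image header at most once and cache the result.
        if not hasattr(self, "_spatial_size"):
            with PIL.Image.open(io.BytesIO(self._data)) as image:
                self._spatial_size = image.height, image.width
        return self._spatial_size
```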
...@@ -105,7 +105,7 @@ class Image(_Feature): ...@@ -105,7 +105,7 @@ class Image(_Feature):
return self._make_repr(color_space=self.color_space) return self._make_repr(color_space=self.color_space)
@property @property
def image_size(self) -> Tuple[int, int]: def spatial_size(self) -> Tuple[int, int]:
return cast(Tuple[int, int], tuple(self.shape[-2:])) return cast(Tuple[int, int], tuple(self.shape[-2:]))
@property @property
......
...@@ -33,7 +33,7 @@ class Mask(_Feature): ...@@ -33,7 +33,7 @@ class Mask(_Feature):
return cls._wrap(tensor) return cls._wrap(tensor)
@property @property
def image_size(self) -> Tuple[int, int]: def spatial_size(self) -> Tuple[int, int]:
return cast(Tuple[int, int], tuple(self.shape[-2:])) return cast(Tuple[int, int], tuple(self.shape[-2:]))
def horizontal_flip(self) -> Mask: def horizontal_flip(self) -> Mask:
......
...@@ -54,9 +54,8 @@ class Video(_Feature): ...@@ -54,9 +54,8 @@ class Video(_Feature):
def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override]
return self._make_repr(color_space=self.color_space) return self._make_repr(color_space=self.color_space)
# TODO: rename this (and all instances of this term to spatial size)
@property @property
def image_size(self) -> Tuple[int, int]: def spatial_size(self) -> Tuple[int, int]:
return cast(Tuple[int, int], tuple(self.shape[-2:])) return cast(Tuple[int, int], tuple(self.shape[-2:]))
@property @property
......
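All of the tensor-backed features follow the same convention: `spatial_size` is simply the trailing two dimensions of the underlying shape, i.e. `(height, width)`, regardless of any leading channel or frame dimensions. A quick illustration with plain tensors standing in for `Image` and `Video` data:

```python
import torch

image_data = torch.rand(3, 720, 1280)      # (C, H, W)
video_data = torch.rand(16, 3, 240, 320)   # (T, C, H, W)

print(tuple(image_data.shape[-2:]))  # (720, 1280)
print(tuple(video_data.shape[-2:]))  # (240, 320)
```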
...@@ -11,7 +11,7 @@ from torchvision.prototype import features ...@@ -11,7 +11,7 @@ from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F, InterpolationMode from torchvision.prototype.transforms import functional as F, InterpolationMode
from ._transform import _RandomApplyTransform from ._transform import _RandomApplyTransform
from ._utils import has_any, query_chw from ._utils import has_any, query_chw, query_spatial_size
class RandomErasing(_RandomApplyTransform): class RandomErasing(_RandomApplyTransform):
...@@ -153,7 +153,7 @@ class RandomCutmix(_BaseMixupCutmix): ...@@ -153,7 +153,7 @@ class RandomCutmix(_BaseMixupCutmix):
def _get_params(self, sample: Any) -> Dict[str, Any]: def _get_params(self, sample: Any) -> Dict[str, Any]:
lam = float(self._dist.sample(())) lam = float(self._dist.sample(()))
_, H, W = query_chw(sample) H, W = query_spatial_size(sample)
r_x = torch.randint(W, ()) r_x = torch.randint(W, ())
r_y = torch.randint(H, ()) r_y = torch.randint(H, ())
......
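`query_spatial_size` is expected to return the `(height, width)` of whatever image-like feature is present in the sample, so transforms that only need the canvas size no longer have to unpack the channel count from `query_chw`. A hypothetical sketch of such a helper, written here only for illustration and not taken from the actual torchvision implementation:

```python
from typing import Any, Tuple

import torch


def query_spatial_size_sketch(sample: Any) -> Tuple[int, int]:
    # Illustration only: the real helper dispatches on the prototype feature
    # types (Image, Video, Mask, BoundingBox, ...) rather than raw tensors.
    flat = sample if isinstance(sample, (list, tuple)) else [sample]
    for item in flat:
        if isinstance(item, torch.Tensor) and item.ndim >= 2:
            height, width = item.shape[-2:]
            return int(height), int(width)
    raise TypeError("No image-like tensor with spatial dimensions found in the sample")
```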
...@@ -100,7 +100,7 @@ class RandomPhotometricDistort(Transform): ...@@ -100,7 +100,7 @@ class RandomPhotometricDistort(Transform):
self.p = p self.p = p
def _get_params(self, sample: Any) -> Dict[str, Any]: def _get_params(self, sample: Any) -> Dict[str, Any]:
num_channels, _, _ = query_chw(sample) num_channels, *_ = query_chw(sample)
return dict( return dict(
zip( zip(
["brightness", "contrast1", "saturation", "hue", "contrast2"], ["brightness", "contrast1", "saturation", "hue", "contrast2"],
......