Unverified Commit 602e8ca1 authored by Philip Meier, committed by GitHub

clamp bounding boxes in some geometry kernels (#7215)


Co-authored-by: vfdev-5 <vfdev.5@gmail.com>
parent 6af6bf45
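
The change routes the outputs of several bounding-box geometry kernels (affine, rotate, pad, crop, perspective, elastic) through `clamp_bounding_box`, so coordinates can no longer fall outside the image. Conceptually, the clamp limits every x coordinate to [0, width] and every y coordinate to [0, height], as the `_clamp_bounding_box` hunk at the bottom of this diff shows. A minimal standalone sketch of that behaviour for XYXY boxes (the helper name `clamp_xyxy_boxes` is ours, not part of torchvision):

import torch

def clamp_xyxy_boxes(boxes: torch.Tensor, spatial_size: tuple) -> torch.Tensor:
    # Clamp x coordinates to [0, width] and y coordinates to [0, height],
    # working in float and casting back to the input dtype, as the kernels below do.
    height, width = spatial_size
    out = boxes.clone() if boxes.is_floating_point() else boxes.float()
    out[..., 0::2].clamp_(min=0, max=width)
    out[..., 1::2].clamp_(min=0, max=height)
    return out.to(boxes.dtype)

boxes = torch.tensor([[-5, 10, 50, 40]])    # partially outside a 32x38 image
print(clamp_xyxy_boxes(boxes, (32, 38)))    # tensor([[ 0, 10, 38, 32]])
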
@@ -108,7 +108,7 @@ def float32_vs_uint8_pixel_difference(atol=1, mae=False):
     }


-def scripted_vs_eager_double_pixel_difference(device, atol=1e-6, rtol=1e-6):
+def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6):
     return {
         (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False},
     }
@@ -211,10 +211,12 @@ def reference_horizontal_flip_bounding_box(bounding_box, *, format, spatial_size
             [-1, 0, spatial_size[1]],
             [0, 1, 0],
         ],
-        dtype="float32",
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
     )

-    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
+    )

     return expected_bboxes
@@ -322,7 +324,7 @@ def reference_inputs_resize_image_tensor():

 def sample_inputs_resize_bounding_box():
     for bounding_box_loader in make_bounding_box_loaders():
         for size in _get_resize_sizes(bounding_box_loader.spatial_size):
-            yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)
+            yield ArgsKwargs(bounding_box_loader, spatial_size=bounding_box_loader.spatial_size, size=size)


 def sample_inputs_resize_mask():
@@ -344,19 +346,20 @@ def reference_resize_bounding_box(bounding_box, *, spatial_size, size, max_size=
             [new_width / old_width, 0, 0],
             [0, new_height / old_height, 0],
         ],
-        dtype="float32",
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
     )

     expected_bboxes = reference_affine_bounding_box_helper(
-        bounding_box, format=datapoints.BoundingBoxFormat.XYXY, affine_matrix=affine_matrix
+        bounding_box,
+        format=bounding_box.format,
+        spatial_size=(new_height, new_width),
+        affine_matrix=affine_matrix,
     )

     return expected_bboxes, (new_height, new_width)


 def reference_inputs_resize_bounding_box():
-    for bounding_box_loader in make_bounding_box_loaders(
-        formats=[datapoints.BoundingBoxFormat.XYXY], extra_dims=((), (4,))
-    ):
+    for bounding_box_loader in make_bounding_box_loaders(extra_dims=((), (4,))):
         for size in _get_resize_sizes(bounding_box_loader.spatial_size):
             yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)
@@ -543,14 +546,17 @@ def _compute_affine_matrix(angle, translate, scale, shear, center):
     return true_matrix


-def reference_affine_bounding_box_helper(bounding_box, *, format, affine_matrix):
-    def transform(bbox, affine_matrix_, format_):
+def reference_affine_bounding_box_helper(bounding_box, *, format, spatial_size, affine_matrix):
+    def transform(bbox, affine_matrix_, format_, spatial_size_):
         # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
         in_dtype = bbox.dtype
         if not torch.is_floating_point(bbox):
             bbox = bbox.float()
         bbox_xyxy = F.convert_format_bounding_box(
-            bbox, old_format=format_, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
+            bbox.as_subclass(torch.Tensor),
+            old_format=format_,
+            new_format=datapoints.BoundingBoxFormat.XYXY,
+            inplace=True,
         )
         points = np.array(
             [
@@ -573,12 +579,15 @@ def reference_affine_bounding_box_helper(bounding_box, *, format, affine_matrix)
         out_bbox = F.convert_format_bounding_box(
             out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True
         )
-        return out_bbox.to(dtype=in_dtype)
+        # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
+        out_bbox = F.clamp_bounding_box(out_bbox, format=format_, spatial_size=spatial_size_)
+        out_bbox = out_bbox.to(dtype=in_dtype)
+        return out_bbox

     if bounding_box.ndim < 2:
         bounding_box = [bounding_box]

-    expected_bboxes = [transform(bbox, affine_matrix, format) for bbox in bounding_box]
+    expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bounding_box]
     if len(expected_bboxes) > 1:
         expected_bboxes = torch.stack(expected_bboxes)
     else:
@@ -594,7 +603,9 @@ def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle,
     affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center)
     affine_matrix = affine_matrix[:2, :]

-    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
+    )

     return expected_bboxes
@@ -643,9 +654,6 @@ KERNEL_INFOS.extend(
             sample_inputs_fn=sample_inputs_affine_bounding_box,
             reference_fn=reference_affine_bounding_box,
             reference_inputs_fn=reference_inputs_affine_bounding_box,
-            closeness_kwargs={
-                (("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0),
-            },
             test_marks=[
                 xfail_jit_python_scalar_arg("shear"),
             ],
@@ -729,10 +737,12 @@ def reference_vertical_flip_bounding_box(bounding_box, *, format, spatial_size):
             [1, 0, 0],
             [0, -1, spatial_size[0]],
         ],
-        dtype="float32",
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
     )

-    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
+    )

     return expected_bboxes
@@ -806,6 +816,43 @@ def sample_inputs_rotate_bounding_box():
         )


+def reference_inputs_rotate_bounding_box():
+    for bounding_box_loader, angle in itertools.product(
+        make_bounding_box_loaders(extra_dims=((), (4,))), _ROTATE_ANGLES
+    ):
+        yield ArgsKwargs(
+            bounding_box_loader,
+            format=bounding_box_loader.format,
+            spatial_size=bounding_box_loader.spatial_size,
+            angle=angle,
+        )
+
+
+# TODO: add samples with expand=True and center
+def reference_rotate_bounding_box(bounding_box, *, format, spatial_size, angle, expand=False, center=None):
+    if center is None:
+        center = [spatial_size[1] * 0.5, spatial_size[0] * 0.5]
+
+    a = np.cos(angle * np.pi / 180.0)
+    b = np.sin(angle * np.pi / 180.0)
+    cx = center[0]
+    cy = center[1]
+    affine_matrix = np.array(
+        [
+            [a, b, cx - cx * a - b * cy],
+            [-b, a, cy + cx * b - a * cy],
+        ],
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
+    )
+
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
+    )
+
+    return expected_bboxes, spatial_size
+
+
 def sample_inputs_rotate_mask():
     for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
         yield ArgsKwargs(mask_loader, angle=15.0)
@@ -834,9 +881,11 @@ KERNEL_INFOS.extend(
         KernelInfo(
             F.rotate_bounding_box,
             sample_inputs_fn=sample_inputs_rotate_bounding_box,
+            reference_fn=reference_rotate_bounding_box,
+            reference_inputs_fn=reference_inputs_rotate_bounding_box,
             closeness_kwargs={
-                **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
-                **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
+                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
+                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
             },
         ),
         KernelInfo(
@@ -897,17 +946,19 @@ def sample_inputs_crop_video():

 def reference_crop_bounding_box(bounding_box, *, format, top, left, height, width):
     affine_matrix = np.array(
         [
             [1, 0, -left],
             [0, 1, -top],
         ],
-        dtype="float32",
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
     )

-    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
-    return expected_bboxes, (height, width)
+    spatial_size = (height, width)
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
+    )
+    return expected_bboxes, spatial_size


 def reference_inputs_crop_bounding_box():
@@ -1119,13 +1170,15 @@ def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, p
             [1, 0, left],
             [0, 1, top],
         ],
-        dtype="float32",
+        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
     )

     height = spatial_size[0] + top + bottom
     width = spatial_size[1] + left + right

-    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    expected_bboxes = reference_affine_bounding_box_helper(
+        bounding_box, format=format, spatial_size=(height, width), affine_matrix=affine_matrix
+    )

     return expected_bboxes, (height, width)
@@ -1225,14 +1278,16 @@ def sample_inputs_perspective_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
+            spatial_size=bounding_box_loader.spatial_size,
             startpoints=None,
             endpoints=None,
             coefficients=_PERSPECTIVE_COEFFS[0],
         )

     format = datapoints.BoundingBoxFormat.XYXY
+    loader = make_bounding_box_loader(format=format)
     yield ArgsKwargs(
-        make_bounding_box_loader(format=format), format=format, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
+        loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
     )
@@ -1269,13 +1324,17 @@ KERNEL_INFOS.extend(
                 **pil_reference_pixel_difference(2, mae=True),
                 **cuda_vs_cpu_pixel_difference(),
                 **float32_vs_uint8_pixel_difference(),
-                **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
-                **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
+                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
+                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
             },
         ),
         KernelInfo(
             F.perspective_bounding_box,
             sample_inputs_fn=sample_inputs_perspective_bounding_box,
+            closeness_kwargs={
+                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
+                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6),
+            },
         ),
         KernelInfo(
             F.perspective_mask,
@@ -1292,8 +1351,8 @@ KERNEL_INFOS.extend(
             sample_inputs_fn=sample_inputs_perspective_video,
             closeness_kwargs={
                 **cuda_vs_cpu_pixel_difference(),
-                **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
-                **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
+                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
+                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
             },
         ),
     ]
@@ -1331,6 +1390,7 @@ def sample_inputs_elastic_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
+            spatial_size=bounding_box_loader.spatial_size,
             displacement=displacement,
         )
......
@@ -146,7 +146,7 @@ class TestSmoke:
             (transforms.RandomZoomOut(p=1.0), None),
             (transforms.Resize([16, 16], antialias=True), None),
             (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
-            (transforms.ClampBoundingBoxes(), None),
+            (transforms.ClampBoundingBox(), None),
             (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
             (transforms.ConvertDtype(), None),
             (transforms.GaussianBlur(kernel_size=3), None),
......
@@ -25,7 +25,7 @@ from torch.utils._pytree import tree_map
 from torchvision.prototype import datapoints
 from torchvision.prototype.transforms import functional as F
 from torchvision.prototype.transforms.functional._geometry import _center_crop_compute_padding
-from torchvision.prototype.transforms.functional._meta import convert_format_bounding_box
+from torchvision.prototype.transforms.functional._meta import clamp_bounding_box, convert_format_bounding_box
 from torchvision.transforms.functional import _get_perspective_coeffs
@@ -257,16 +257,17 @@ class TestKernels:
     @reference_inputs
     def test_against_reference(self, test_id, info, args_kwargs):
         (input, *other_args), kwargs = args_kwargs.load("cpu")
-        input = input.as_subclass(torch.Tensor)

-        actual = info.kernel(input, *other_args, **kwargs)
+        actual = info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs)
+        # We intentionally don't unwrap the input of the reference function in order for it to have access to all
+        # metadata regardless of whether the kernel takes it explicitly or not
         expected = info.reference_fn(input, *other_args, **kwargs)

         assert_close(
             actual,
             expected,
             **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device),
-            msg=parametrized_error_message(*other_args, **kwargs),
+            msg=parametrized_error_message(input, *other_args, **kwargs),
         )

     @make_info_args_kwargs_parametrization(
@@ -682,6 +683,10 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
         (48.56528888843238, 9.611532109828834, 53.35347829361575, 14.39972151501221),
     ]

+    expected_bboxes = clamp_bounding_box(
+        datapoints.BoundingBox(expected_bboxes, format="XYXY", spatial_size=spatial_size)
+    ).tolist()
+
     output_boxes = F.affine_bounding_box(
         in_boxes,
         format=format,
@@ -762,7 +767,8 @@ def test_correctness_rotate_bounding_box(angle, expand, center):
             dtype=bbox.dtype,
             device=bbox.device,
         )
-        return convert_format_bounding_box(out_bbox, new_format=bbox.format), (height, width)
+        out_bbox = clamp_bounding_box(convert_format_bounding_box(out_bbox, new_format=bbox.format))
+        return out_bbox, (height, width)

     spatial_size = (32, 38)
@@ -839,6 +845,9 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
         [69.27564928, 12.39339828, 74.93250353, 18.05025253],
         [18.36396103, 1.07968978, 46.64823228, 29.36396103],
     ]
+    expected_bboxes = clamp_bounding_box(
+        datapoints.BoundingBox(expected_bboxes, format="XYXY", spatial_size=spatial_size)
+    ).tolist()

     output_boxes, _ = F.rotate_bounding_box(
         in_boxes,
@@ -905,6 +914,10 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
     if format != datapoints.BoundingBoxFormat.XYXY:
         in_boxes = convert_format_bounding_box(in_boxes, datapoints.BoundingBoxFormat.XYXY, format)

+    expected_bboxes = clamp_bounding_box(
+        datapoints.BoundingBox(expected_bboxes, format="XYXY", spatial_size=spatial_size)
+    ).tolist()
+
     output_boxes, output_spatial_size = F.crop_bounding_box(
         in_boxes,
         format,
@@ -1121,7 +1134,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
             dtype=bbox.dtype,
             device=bbox.device,
         )
-        return convert_format_bounding_box(out_bbox, new_format=bbox.format)
+        return clamp_bounding_box(convert_format_bounding_box(out_bbox, new_format=bbox.format))

     spatial_size = (32, 38)
@@ -1134,6 +1147,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
     output_bboxes = F.perspective_bounding_box(
         bboxes.as_subclass(torch.Tensor),
         format=bboxes.format,
+        spatial_size=bboxes.spatial_size,
         startpoints=None,
         endpoints=None,
         coefficients=pcoeffs,
@@ -1178,6 +1192,7 @@ def test_correctness_center_crop_bounding_box(device, output_size):
         ]
         out_bbox = torch.tensor(out_bbox)
         out_bbox = convert_format_bounding_box(out_bbox, datapoints.BoundingBoxFormat.XYWH, format_)
+        out_bbox = clamp_bounding_box(out_bbox, format=format_, spatial_size=output_size)
         return out_bbox.to(dtype=dtype, device=bbox.device)

     for bboxes in make_bounding_boxes(extra_dims=((4,),)):
@@ -1201,7 +1216,8 @@ def test_correctness_center_crop_bounding_box(device, output_size):
         expected_bboxes = torch.stack(expected_bboxes)
     else:
         expected_bboxes = expected_bboxes[0]
-    torch.testing.assert_close(output_boxes, expected_bboxes)
+
+    torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0)
     torch.testing.assert_close(output_spatial_size, output_size)
......
@@ -81,7 +81,10 @@ class BoundingBox(Datapoint):
         antialias: Optional[Union[str, bool]] = "warn",
     ) -> BoundingBox:
         output, spatial_size = self._F.resize_bounding_box(
-            self.as_subclass(torch.Tensor), spatial_size=self.spatial_size, size=size, max_size=max_size
+            self.as_subclass(torch.Tensor),
+            spatial_size=self.spatial_size,
+            size=size,
+            max_size=max_size,
         )
         return BoundingBox.wrap_like(self, output, spatial_size=spatial_size)
@@ -178,6 +181,7 @@ class BoundingBox(Datapoint):
         output = self._F.perspective_bounding_box(
             self.as_subclass(torch.Tensor),
             format=self.format,
+            spatial_size=self.spatial_size,
             startpoints=startpoints,
             endpoints=endpoints,
             coefficients=coefficients,
@@ -190,5 +194,7 @@ class BoundingBox(Datapoint):
         interpolation: InterpolationMode = InterpolationMode.BILINEAR,
         fill: FillTypeJIT = None,
     ) -> BoundingBox:
-        output = self._F.elastic_bounding_box(self.as_subclass(torch.Tensor), self.format, displacement)
+        output = self._F.elastic_bounding_box(
+            self.as_subclass(torch.Tensor), self.format, self.spatial_size, displacement=displacement
+        )
         return BoundingBox.wrap_like(self, output)
@@ -41,7 +41,7 @@ from ._geometry import (
     ScaleJitter,
     TenCrop,
 )
-from ._meta import ClampBoundingBoxes, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype
+from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype
 from ._misc import (
     GaussianBlur,
     Identity,
......
@@ -42,7 +42,7 @@ class ConvertDtype(Transform):
 ConvertImageDtype = ConvertDtype


-class ClampBoundingBoxes(Transform):
+class ClampBoundingBox(Transform):
     _transformed_types = (datapoints.BoundingBox,)

     def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox:
......
@@ -22,7 +22,7 @@ from torchvision.transforms.functional_tensor import _pad_symmetric
 from torchvision.utils import _log_api_usage_once

-from ._meta import convert_format_bounding_box, get_spatial_size_image_pil
+from ._meta import clamp_bounding_box, convert_format_bounding_box, get_spatial_size_image_pil
 from ._utils import is_simple_tensor
@@ -580,8 +580,9 @@ def affine_image_pil(
     return _FP.affine(image, matrix, interpolation=pil_modes_mapping[interpolation], fill=fill)


-def _affine_bounding_box_xyxy(
+def _affine_bounding_box_with_expand(
     bounding_box: torch.Tensor,
+    format: datapoints.BoundingBoxFormat,
     spatial_size: Tuple[int, int],
     angle: Union[int, float],
     translate: List[float],
@@ -593,6 +594,17 @@ def _affine_bounding_box_xyxy(
     if bounding_box.numel() == 0:
         return bounding_box, spatial_size

+    original_shape = bounding_box.shape
+    original_dtype = bounding_box.dtype
+    bounding_box = bounding_box.clone() if bounding_box.is_floating_point() else bounding_box.float()
+    dtype = bounding_box.dtype
+    device = bounding_box.device
+    bounding_box = (
+        convert_format_bounding_box(
+            bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
+        )
+    ).reshape(-1, 4)
+
     angle, translate, shear, center = _affine_parse_args(
         angle, translate, scale, shear, InterpolationMode.NEAREST, center
     )
@@ -601,9 +613,6 @@ def _affine_bounding_box_xyxy(
         height, width = spatial_size
         center = [width * 0.5, height * 0.5]

-    dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32
-    device = bounding_box.device
-
     affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False)
     transposed_affine_matrix = (
         torch.tensor(
@@ -651,7 +660,13 @@ def _affine_bounding_box_xyxy(
         new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
         spatial_size = (new_height, new_width)

-    return out_bboxes.to(bounding_box.dtype), spatial_size
+    out_bboxes = clamp_bounding_box(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size)
+    out_bboxes = convert_format_bounding_box(
+        out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    ).reshape(original_shape)
+    out_bboxes = out_bboxes.to(original_dtype)
+    return out_bboxes, spatial_size


 def affine_bounding_box(
@@ -664,19 +679,18 @@ def affine_bounding_box(
     shear: List[float],
     center: Optional[List[float]] = None,
 ) -> torch.Tensor:
-    original_shape = bounding_box.shape
-
-    bounding_box = (
-        convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
-    ).reshape(-1, 4)
-
-    out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center)
-
-    # out_bboxes should be of shape [N boxes, 4]
-
-    return convert_format_bounding_box(
-        out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
-    ).reshape(original_shape)
+    out_box, _ = _affine_bounding_box_with_expand(
+        bounding_box,
+        format=format,
+        spatial_size=spatial_size,
+        angle=angle,
+        translate=translate,
+        scale=scale,
+        shear=shear,
+        center=center,
+        expand=False,
+    )
+    return out_box


 def affine_mask(
@@ -852,14 +866,10 @@ def rotate_bounding_box(
         warnings.warn("The provided center argument has no effect on the result if expand is True")
         center = None

-    original_shape = bounding_box.shape
-    bounding_box = (
-        convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
-    ).reshape(-1, 4)
-
-    out_bboxes, spatial_size = _affine_bounding_box_xyxy(
+    return _affine_bounding_box_with_expand(
         bounding_box,
-        spatial_size,
+        format=format,
+        spatial_size=spatial_size,
         angle=-angle,
         translate=[0.0, 0.0],
         scale=1.0,
@@ -868,13 +878,6 @@ def rotate_bounding_box(
         expand=expand,
     )

-    return (
-        convert_format_bounding_box(
-            out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
-        ).reshape(original_shape),
-        spatial_size,
-    )
-

 def rotate_mask(
     mask: torch.Tensor,
@@ -1112,8 +1115,9 @@ def pad_bounding_box(
     height, width = spatial_size
     height += top + bottom
     width += left + right
+    spatial_size = (height, width)

-    return bounding_box, (height, width)
+    return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size


 def pad_video(
@@ -1185,8 +1189,9 @@ def crop_bounding_box(
     sub = [left, top, 0, 0]

     bounding_box = bounding_box - torch.tensor(sub, dtype=bounding_box.dtype, device=bounding_box.device)
+    spatial_size = (height, width)

-    return bounding_box, (height, width)
+    return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size


 def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
@@ -1332,6 +1337,7 @@ def perspective_image_pil(
 def perspective_bounding_box(
     bounding_box: torch.Tensor,
     format: datapoints.BoundingBoxFormat,
+    spatial_size: Tuple[int, int],
     startpoints: Optional[List[List[int]]],
     endpoints: Optional[List[List[int]]],
     coefficients: Optional[List[float]] = None,
@@ -1342,6 +1348,7 @@ def perspective_bounding_box(
     perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
     original_shape = bounding_box.shape
+    # TODO: first cast to float if bbox is int64 before convert_format_bounding_box
     bounding_box = (
         convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
     ).reshape(-1, 4)
@@ -1408,7 +1415,11 @@ def perspective_bounding_box(
     transformed_points = transformed_points.reshape(-1, 4, 2)
     out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
-    out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype)
+    out_bboxes = clamp_bounding_box(
+        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype),
+        format=datapoints.BoundingBoxFormat.XYXY,
+        spatial_size=spatial_size,
+    )

     # out_bboxes should be of shape [N boxes, 4]
@@ -1549,6 +1560,7 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to
 def elastic_bounding_box(
     bounding_box: torch.Tensor,
     format: datapoints.BoundingBoxFormat,
+    spatial_size: Tuple[int, int],
     displacement: torch.Tensor,
 ) -> torch.Tensor:
     if bounding_box.numel() == 0:
@@ -1562,14 +1574,11 @@ def elastic_bounding_box(
     displacement = displacement.to(dtype=dtype, device=device)

     original_shape = bounding_box.shape
+    # TODO: first cast to float if bbox is int64 before convert_format_bounding_box
     bounding_box = (
         convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
     ).reshape(-1, 4)

-    # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it
-    # Or add spatial_size arg and check displacement shape
-    spatial_size = displacement.shape[-3], displacement.shape[-2]
     id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype)
     # We construct an approximation of inverse grid as inv_grid = id_grid - displacement
     # This is not an exact inverse of the grid
@@ -1588,7 +1597,11 @@ def elastic_bounding_box(
     transformed_points = transformed_points.reshape(-1, 4, 2)
     out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
-    out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype)
+    out_bboxes = clamp_bounding_box(
+        torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype),
+        format=datapoints.BoundingBoxFormat.XYXY,
+        spatial_size=spatial_size,
+    )

     return convert_format_bounding_box(
         out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
@@ -1796,7 +1809,7 @@ def resized_crop_bounding_box(
     size: List[int],
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
     bounding_box, _ = crop_bounding_box(bounding_box, format, top, left, height, width)
-    return resize_bounding_box(bounding_box, (height, width), size)
+    return resize_bounding_box(bounding_box, spatial_size=(height, width), size=size)


 def resized_crop_mask(
......
@@ -245,12 +245,17 @@ def _clamp_bounding_box(
 ) -> torch.Tensor:
     # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
     # BoundingBoxFormat instead of converting back and forth
+    in_dtype = bounding_box.dtype
+    bounding_box = bounding_box.clone() if bounding_box.is_floating_point() else bounding_box.float()
     xyxy_boxes = convert_format_bounding_box(
-        bounding_box.clone(), old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
+        bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
     )
     xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1])
     xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0])
-    return convert_format_bounding_box(xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True)
+    out_boxes = convert_format_bounding_box(
+        xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True
+    )
+    return out_boxes.to(in_dtype)


 def clamp_bounding_box(
......