OpenDAS / vision · Commits · d84aaae1

Unverified commit d84aaae1, authored Sep 09, 2023 by Philip Meier, committed by GitHub on Sep 09, 2023.

port tests for F.perspective and transforms.RandomPerspective (#7943)

parent 7253af58
Showing 8 changed files with 252 additions and 266 deletions:

  test/test_transforms_v2.py                           +0    -20
  test/test_transforms_v2_consistency.py               +0    -16
  test/test_transforms_v2_functional.py                +0    -79
  test/test_transforms_v2_refactored.py                +248  -0
  test/transforms_v2_dispatcher_infos.py               +0    -13
  test/transforms_v2_kernel_infos.py                   +1    -135
  torchvision/transforms/v2/_geometry.py               +2    -2
  torchvision/transforms/v2/functional/_geometry.py    +1    -1
test/test_transforms_v2.py

@@ -418,26 +418,6 @@ class TestRandomZoomOut:
         assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h
 
-
-class TestRandomPerspective:
-    def test_assertions(self):
-        with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"):
-            transforms.RandomPerspective(distortion_scale=-1.0)
-
-        with pytest.raises(TypeError, match="Got inappropriate fill arg"):
-            transforms.RandomPerspective(0.5, fill="abc")
-
-    def test__get_params(self):
-        dscale = 0.5
-        transform = transforms.RandomPerspective(dscale)
-        image = make_image((24, 32))
-
-        params = transform._get_params([image])
-
-        assert "coefficients" in params
-        assert len(params["coefficients"]) == 8
-
 class TestElasticTransform:
     def test_assertions(self):
...
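The removed class exercised transforms.RandomPerspective through the public API. For readers following along outside the test harness, a minimal standalone sketch of the same behavior (assuming only torch and torchvision are installed) looks like:

import torch
from torchvision.transforms import v2 as transforms

# distortion_scale must lie in [0, 1]; p=1.0 forces the warp to be applied.
perspective = transforms.RandomPerspective(distortion_scale=0.5, p=1.0)

image = torch.randint(0, 256, (3, 24, 32), dtype=torch.uint8)
warped = perspective(image)
assert warped.shape == image.shape  # the spatial size is preserved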
test/test_transforms_v2_consistency.py

@@ -6,7 +6,6 @@ import re
 from pathlib import Path
 
 import numpy as np
-import PIL.Image
 import pytest
 import torch
...
@@ -246,20 +245,6 @@ CONSISTENCY_CONFIGS = [
         ],
         closeness_kwargs={"atol": 1e-5, "rtol": 1e-5},
     ),
-    ConsistencyConfig(
-        v2_transforms.RandomPerspective,
-        legacy_transforms.RandomPerspective,
-        [
-            ArgsKwargs(p=0),
-            ArgsKwargs(p=1),
-            ArgsKwargs(p=1, distortion_scale=0.3),
-            ArgsKwargs(p=1, distortion_scale=0.2, interpolation=v2_transforms.InterpolationMode.NEAREST),
-            ArgsKwargs(p=1, distortion_scale=0.2, interpolation=PIL.Image.NEAREST),
-            ArgsKwargs(p=1, distortion_scale=0.1, fill=1),
-            ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)),
-        ],
-        closeness_kwargs={"atol": None, "rtol": None},
-    ),
     ConsistencyConfig(
         v2_transforms.PILToTensor,
         legacy_transforms.PILToTensor,
...
@@ -478,7 +463,6 @@ get_params_parametrization = pytest.mark.parametrize(
     )
     for transform_cls, get_params_args_kwargs in [
         (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)),
-        (v2_transforms.RandomPerspective, ArgsKwargs(23, 17, 0.5)),
         (v2_transforms.AutoAugment, ArgsKwargs(5)),
     ]
 ],
...
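The dropped ConsistencyConfig compared the v2 transform against its v1 counterpart. A rough sketch of the kind of check those configs drove, assuming (as the consistency harness did) that both implementations draw identical random parameters under a shared seed:

import torch
from torchvision import transforms as legacy_transforms
from torchvision.transforms import v2 as v2_transforms

image = torch.randint(0, 256, (3, 24, 32), dtype=torch.uint8)

# Same seed before each call so both transforms sample the same warp.
torch.manual_seed(0)
expected = legacy_transforms.RandomPerspective(p=1)(image)
torch.manual_seed(0)
actual = v2_transforms.RandomPerspective(p=1)(image)

torch.testing.assert_close(actual, expected, rtol=0, atol=1)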
test/test_transforms_v2_functional.py

@@ -9,10 +9,8 @@ import torch
 from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed
 from torch.utils._pytree import tree_map
 from torchvision import tv_tensors
-from torchvision.transforms.functional import _get_perspective_coeffs
 from torchvision.transforms.v2 import functional as F
 from torchvision.transforms.v2._utils import is_pure_tensor
-from torchvision.transforms.v2.functional._meta import clamp_bounding_boxes, convert_bounding_box_format
 from transforms_v2_dispatcher_infos import DISPATCHER_INFOS
 from transforms_v2_kernel_infos import KERNEL_INFOS
 from transforms_v2_legacy_utils import (
...
@@ -523,83 +521,6 @@ class TestClampBoundingBoxes:
 # `transforms_v2_kernel_infos.py`
 
-@pytest.mark.parametrize("device", cpu_and_cuda())
-@pytest.mark.parametrize(
-    "startpoints, endpoints",
-    [
-        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
-        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
-        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
-    ],
-)
-def test_correctness_perspective_bounding_boxes(device, startpoints, endpoints):
-    def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_):
-        m1 = np.array(
-            [
-                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
-                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
-            ]
-        )
-        m2 = np.array(
-            [
-                [pcoeffs_[6], pcoeffs_[7], 1.0],
-                [pcoeffs_[6], pcoeffs_[7], 1.0],
-            ]
-        )
-
-        bbox_xyxy = convert_bounding_box_format(bbox, old_format=format_, new_format=tv_tensors.BoundingBoxFormat.XYXY)
-        points = np.array(
-            [
-                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
-                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
-                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
-                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
-            ]
-        )
-        numer = np.matmul(points, m1.T)
-        denom = np.matmul(points, m2.T)
-        transformed_points = numer / denom
-        out_bbox = np.array(
-            [
-                np.min(transformed_points[:, 0]),
-                np.min(transformed_points[:, 1]),
-                np.max(transformed_points[:, 0]),
-                np.max(transformed_points[:, 1]),
-            ]
-        )
-        out_bbox = torch.from_numpy(out_bbox)
-        out_bbox = convert_bounding_box_format(
-            out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_
-        )
-        return clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_).to(bbox)
-
-    canvas_size = (32, 38)
-
-    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
-    inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)
-
-    for bboxes in make_multiple_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)):
-        bboxes = bboxes.to(device)
-
-        output_bboxes = F.perspective_bounding_boxes(
-            bboxes.as_subclass(torch.Tensor),
-            format=bboxes.format,
-            canvas_size=bboxes.canvas_size,
-            startpoints=None,
-            endpoints=None,
-            coefficients=pcoeffs,
-        )
-
-        expected_bboxes = torch.stack(
-            [
-                _compute_expected_bbox(b, bboxes.format, bboxes.canvas_size, inv_pcoeffs)
-                for b in bboxes.reshape(-1, 4).unbind()
-            ]
-        ).reshape(bboxes.shape)
-
-        torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=0, atol=1)
-
-
 @pytest.mark.parametrize(
     "inpt",
     [
...
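The _compute_expected_bbox helper above applies the standard PIL-style perspective mapping: eight coefficients (a, b, c, d, e, f, g, h) send a point (x, y) to ((a*x + b*y + c) / (g*x + h*y + 1), (d*x + e*y + f) / (g*x + h*y + 1)). A self-contained illustration of that formula, with a hypothetical helper name:

def warp_point(x, y, c):
    # c = (a, b, c, d, e, f, g, h), the perspective coefficients
    denom = c[6] * x + c[7] * y + 1.0
    return (
        (c[0] * x + c[1] * y + c[2]) / denom,
        (c[3] * x + c[4] * y + c[5]) / denom,
    )

# The identity coefficients leave every point unchanged.
identity = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
assert warp_point(3.0, 2.0, identity) == (3.0, 2.0)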
test/test_transforms_v2_refactored.py

@@ -45,6 +45,7 @@ from torchvision import tv_tensors
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
 from torchvision.transforms.functional import pil_modes_mapping
 from torchvision.transforms.v2 import functional as F
+from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs
 from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal
...
@@ -3430,6 +3431,9 @@ class TestPad:
         with pytest.raises(ValueError, match="Non-scalar fill value is not supported"):
             check_kernel(F.pad_mask, make_segmentation_mask(), padding=[1], fill=fill)
 
+    def test_kernel_video(self):
+        check_kernel(F.pad_video, make_video(), padding=[1])
+
     @pytest.mark.parametrize(
         "make_input",
         [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
...
@@ -3633,3 +3637,247 @@ class TestCenterCrop:
         expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size)
 
         assert_equal(actual, expected)
+
+
+class TestPerspective:
+    COEFFICIENTS = [
+        [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+        [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+    ]
+    START_END_POINTS = [
+        ([[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]),
+        ([[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]),
+        ([[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]),
+    ]
+    MINIMAL_KWARGS = dict(startpoints=None, endpoints=None, coefficients=COEFFICIENTS[0])
+
+    @param_value_parametrization(
+        coefficients=COEFFICIENTS,
+        start_end_points=START_END_POINTS,
+        fill=EXHAUSTIVE_TYPE_FILLS,
+    )
+    @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel_image(self, param, value, dtype, device):
+        if param == "start_end_points":
+            kwargs = dict(zip(["startpoints", "endpoints"], value))
+        else:
+            kwargs = {"startpoints": None, "endpoints": None, param: value}
+        if param == "fill":
+            kwargs["coefficients"] = self.COEFFICIENTS[0]
+
+        check_kernel(
+            F.perspective_image,
+            make_image(dtype=dtype, device=device),
+            **kwargs,
+            check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))),
+        )
+
+    def test_kernel_image_error(self):
+        image = make_image_tensor()
+
+        with pytest.raises(ValueError, match="startpoints/endpoints or the coefficients must have non `None` values"):
+            F.perspective_image(image, startpoints=None, endpoints=None)
+
+        with pytest.raises(
+            ValueError, match="startpoints/endpoints and the coefficients shouldn't be defined concurrently"
+        ):
+            startpoints, endpoints = self.START_END_POINTS[0]
+            coefficients = self.COEFFICIENTS[0]
+            F.perspective_image(image, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients)
+
+        with pytest.raises(ValueError, match="coefficients should have 8 float values"):
+            F.perspective_image(image, startpoints=None, endpoints=None, coefficients=list(range(7)))
+
+    @param_value_parametrization(
+        coefficients=COEFFICIENTS,
+        start_end_points=START_END_POINTS,
+    )
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    def test_kernel_bounding_boxes(self, param, value, format):
+        if param == "start_end_points":
+            kwargs = dict(zip(["startpoints", "endpoints"], value))
+        else:
+            kwargs = {"startpoints": None, "endpoints": None, param: value}
+
+        bounding_boxes = make_bounding_boxes(format=format)
+
+        check_kernel(
+            F.perspective_bounding_boxes,
+            bounding_boxes,
+            format=bounding_boxes.format,
+            canvas_size=bounding_boxes.canvas_size,
+            **kwargs,
+        )
+
+    def test_kernel_bounding_boxes_error(self):
+        bounding_boxes = make_bounding_boxes()
+        format, canvas_size = bounding_boxes.format, bounding_boxes.canvas_size
+        bounding_boxes = bounding_boxes.as_subclass(torch.Tensor)
+
+        with pytest.raises(RuntimeError, match="Denominator is zero"):
+            F.perspective_bounding_boxes(
+                bounding_boxes,
+                format=format,
+                canvas_size=canvas_size,
+                startpoints=None,
+                endpoints=None,
+                coefficients=[0.0] * 8,
+            )
+
+    @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+    def test_kernel_mask(self, make_mask):
+        check_kernel(F.perspective_mask, make_mask(), **self.MINIMAL_KWARGS)
+
+    def test_kernel_video(self):
+        check_kernel(F.perspective_video, make_video(), **self.MINIMAL_KWARGS)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_functional(self, make_input):
+        check_functional(F.perspective, make_input(), **self.MINIMAL_KWARGS)
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.perspective_image, torch.Tensor),
+            (F._perspective_image_pil, PIL.Image.Image),
+            (F.perspective_image, tv_tensors.Image),
+            (F.perspective_bounding_boxes, tv_tensors.BoundingBoxes),
+            (F.perspective_mask, tv_tensors.Mask),
+            (F.perspective_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.perspective, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize("distortion_scale", [0.5, 0.0, 1.0])
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_transform(self, distortion_scale, make_input):
+        check_transform(transforms.RandomPerspective(distortion_scale=distortion_scale, p=1), make_input())
+
+    @pytest.mark.parametrize("distortion_scale", [-1, 2])
+    def test_transform_error(self, distortion_scale):
+        with pytest.raises(ValueError, match="distortion_scale value should be between 0 and 1"):
+            transforms.RandomPerspective(distortion_scale=distortion_scale)
+
+    @pytest.mark.parametrize("coefficients", COEFFICIENTS)
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    def test_image_functional_correctness(self, coefficients, interpolation, fill):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        actual = F.perspective(
+            image, startpoints=None, endpoints=None, coefficients=coefficients, interpolation=interpolation, fill=fill
+        )
+        expected = F.to_image(
+            F.perspective(
+                F.to_pil_image(image),
+                startpoints=None,
+                endpoints=None,
+                coefficients=coefficients,
+                interpolation=interpolation,
+                fill=fill,
+            )
+        )
+
+        if interpolation is transforms.InterpolationMode.BILINEAR:
+            abs_diff = (actual.float() - expected.float()).abs()
+            assert (abs_diff > 1).float().mean() < 7e-2
+            mae = abs_diff.mean()
+            assert mae < 3
+        else:
+            assert_equal(actual, expected)
+
+    def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints):
+        format = bounding_boxes.format
+        canvas_size = bounding_boxes.canvas_size
+        dtype = bounding_boxes.dtype
+        device = bounding_boxes.device
+
+        coefficients = _get_perspective_coeffs(endpoints, startpoints)
+
+        def perspective_bounding_boxes(bounding_boxes):
+            m1 = np.array(
+                [
+                    [coefficients[0], coefficients[1], coefficients[2]],
+                    [coefficients[3], coefficients[4], coefficients[5]],
+                ]
+            )
+            m2 = np.array(
+                [
+                    [coefficients[6], coefficients[7], 1.0],
+                    [coefficients[6], coefficients[7], 1.0],
+                ]
+            )
+
+            # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
+            input_xyxy = F.convert_bounding_box_format(
+                bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True),
+                old_format=format,
+                new_format=tv_tensors.BoundingBoxFormat.XYXY,
+                inplace=True,
+            )
+            x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist()
+
+            points = np.array(
+                [
+                    [x1, y1, 1.0],
+                    [x2, y1, 1.0],
+                    [x1, y2, 1.0],
+                    [x2, y2, 1.0],
+                ]
+            )
+
+            numerator = points @ m1.T
+            denominator = points @ m2.T
+            transformed_points = numerator / denominator
+
+            output_xyxy = torch.Tensor(
+                [
+                    float(np.min(transformed_points[:, 0])),
+                    float(np.min(transformed_points[:, 1])),
+                    float(np.max(transformed_points[:, 0])),
+                    float(np.max(transformed_points[:, 1])),
+                ]
+            )
+
+            output = F.convert_bounding_box_format(
+                output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format
+            )
+
+            # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
+            return F.clamp_bounding_boxes(
+                output,
+                format=format,
+                canvas_size=canvas_size,
+            ).to(dtype=dtype, device=device)
+
+        return tv_tensors.BoundingBoxes(
+            torch.cat(
+                [perspective_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0
+            ).reshape(bounding_boxes.shape),
+            format=format,
+            canvas_size=canvas_size,
+        )
+
+    @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS)
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device):
+        bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device)
+
+        actual = F.perspective(bounding_boxes, startpoints=startpoints, endpoints=endpoints)
+        expected = self._reference_perspective_bounding_boxes(
+            bounding_boxes, startpoints=startpoints, endpoints=endpoints
+        )
+
+        assert_close(actual, expected, rtol=0, atol=1)
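The new TestPerspective class covers the low-level kernels, the F.perspective functional, and the RandomPerspective transform. For orientation, a minimal sketch of the functional entry point it exercises, reusing START_END_POINTS[0] (image and box shapes here are arbitrary):

import torch
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]]
endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]]

image = tv_tensors.Image(torch.randint(0, 256, (3, 26, 34), dtype=torch.uint8))
boxes = tv_tensors.BoundingBoxes([[1, 1, 10, 10]], format="XYXY", canvas_size=(26, 34))

# F.perspective dispatches to perspective_image, perspective_bounding_boxes,
# etc. based on the input type. The warp can be given either as start/end
# points or as the 8 coefficients, but not both at once.
warped_image = F.perspective(image, startpoints=startpoints, endpoints=endpoints)
warped_boxes = F.perspective(boxes, startpoints=startpoints, endpoints=endpoints)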
test/transforms_v2_dispatcher_infos.py

@@ -111,19 +111,6 @@ multi_crop_skips.append(skip_dispatch_tv_tensor)
 DISPATCHER_INFOS = [
-    DispatcherInfo(
-        F.perspective,
-        kernels={
-            tv_tensors.Image: F.perspective_image,
-            tv_tensors.Video: F.perspective_video,
-            tv_tensors.BoundingBoxes: F.perspective_bounding_boxes,
-            tv_tensors.Mask: F.perspective_mask,
-        },
-        pil_kernel_info=PILKernelInfo(F._perspective_image_pil),
-        test_marks=[
-            xfail_jit_python_scalar_arg("fill"),
-        ],
-    ),
     DispatcherInfo(
         F.elastic,
         kernels={
...
test/transforms_v2_kernel_infos.py

@@ -5,23 +5,18 @@ import PIL.Image
 import pytest
 import torch.testing
 import torchvision.transforms.v2.functional as F
-from torchvision import tv_tensors
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
-from transforms_v2_legacy_utils import (  # noqa: F401
+from transforms_v2_legacy_utils import (
     ArgsKwargs,
-    combinations_grid,
     DEFAULT_PORTRAIT_SPATIAL_SIZE,
     get_num_channels,
     ImageLoader,
     InfoBase,
-    make_bounding_box_loader,
     make_bounding_box_loaders,
-    make_detection_mask_loader,
     make_image_loader,
     make_image_loaders,
     make_image_loaders_for_interpolation,
     make_mask_loaders,
-    make_video_loader,
     make_video_loaders,
     mark_framework_limitation,
     TestMark,
...
@@ -182,135 +177,6 @@ def float32_vs_uint8_fill_adapter(other_args, kwargs):
     return other_args, dict(kwargs, fill=fill)
 
-_PERSPECTIVE_COEFFS = [
-    [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
-    [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
-]
-_STARTPOINTS = [[0, 1], [2, 3], [4, 5], [6, 7]]
-_ENDPOINTS = [[9, 8], [7, 6], [5, 4], [3, 2]]
-
-
-def sample_inputs_perspective_image_tensor():
-    for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]):
-        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
-            yield ArgsKwargs(
-                image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0]
-            )
-
-    yield ArgsKwargs(make_image_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)
-
-
-def reference_inputs_perspective_image_tensor():
-    for image_loader, coefficients, interpolation in itertools.product(
-        make_image_loaders_for_interpolation(),
-        _PERSPECTIVE_COEFFS,
-        [
-            F.InterpolationMode.NEAREST,
-            F.InterpolationMode.BILINEAR,
-        ],
-    ):
-        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
-            # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it?
-            if isinstance(fill, (list, tuple)):
-                continue
-
-            yield ArgsKwargs(
-                image_loader,
-                startpoints=None,
-                endpoints=None,
-                interpolation=interpolation,
-                fill=fill,
-                coefficients=coefficients,
-            )
-
-
-def sample_inputs_perspective_bounding_boxes():
-    for bounding_boxes_loader in make_bounding_box_loaders():
-        yield ArgsKwargs(
-            bounding_boxes_loader,
-            format=bounding_boxes_loader.format,
-            canvas_size=bounding_boxes_loader.canvas_size,
-            startpoints=None,
-            endpoints=None,
-            coefficients=_PERSPECTIVE_COEFFS[0],
-        )
-
-    format = tv_tensors.BoundingBoxFormat.XYXY
-    loader = make_bounding_box_loader(format=format)
-    yield ArgsKwargs(
-        loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
-    )
-
-
-def sample_inputs_perspective_mask():
-    for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]):
-        yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0])
-
-    yield ArgsKwargs(make_detection_mask_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)
-
-
-def reference_inputs_perspective_mask():
-    for mask_loader, perspective_coeffs in itertools.product(
-        make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS
-    ):
-        yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=perspective_coeffs)
-
-
-def sample_inputs_perspective_video():
-    for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]):
-        yield ArgsKwargs(video_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0])
-
-    yield ArgsKwargs(make_video_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)
-
-
-KERNEL_INFOS.extend(
-    [
-        KernelInfo(
-            F.perspective_image,
-            sample_inputs_fn=sample_inputs_perspective_image_tensor,
-            reference_fn=pil_reference_wrapper(F._perspective_image_pil),
-            reference_inputs_fn=reference_inputs_perspective_image_tensor,
-            float32_vs_uint8=float32_vs_uint8_fill_adapter,
-            closeness_kwargs={
-                **pil_reference_pixel_difference(2, mae=True),
-                **cuda_vs_cpu_pixel_difference(),
-                **float32_vs_uint8_pixel_difference(),
-                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
-                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
-            },
-            test_marks=[xfail_jit_python_scalar_arg("fill")],
-        ),
-        KernelInfo(
-            F.perspective_bounding_boxes,
-            sample_inputs_fn=sample_inputs_perspective_bounding_boxes,
-            closeness_kwargs={
-                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
-                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6),
-            },
-        ),
-        KernelInfo(
-            F.perspective_mask,
-            sample_inputs_fn=sample_inputs_perspective_mask,
-            reference_fn=pil_reference_wrapper(F._perspective_image_pil),
-            reference_inputs_fn=reference_inputs_perspective_mask,
-            float32_vs_uint8=True,
-            closeness_kwargs={
-                (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): dict(atol=10, rtol=0),
-            },
-        ),
-        KernelInfo(
-            F.perspective_video,
-            sample_inputs_fn=sample_inputs_perspective_video,
-            closeness_kwargs={
-                **cuda_vs_cpu_pixel_difference(),
-                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
-                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
-            },
-        ),
-    ]
-)
-
-
 def _get_elastic_displacement(canvas_size):
     return torch.rand(1, *canvas_size, 2)
...
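The removed sample-input generators fed fixed coefficient lists and point pairs into the kernels. The two parameterizations are related by _get_perspective_coeffs, a private torchvision helper whose location and signature may change between releases; a small sketch of the relation (note the removed functional test computed inv_pcoeffs by swapping the two arguments):

from torchvision.transforms.functional import _get_perspective_coeffs

startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]]
endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]]

# Least-squares solve for the 8 coefficients (a, b, c, d, e, f, g, h) of the
# projective map between the two quadrilaterals.
coeffs = _get_perspective_coeffs(startpoints, endpoints)
assert len(coeffs) == 8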
torchvision/transforms/v2/_geometry.py

@@ -1008,8 +1008,8 @@ class RandomPerspective(_RandomApplyTransform):
         return self._call_kernel(
             F.perspective,
             inpt,
-            None,
-            None,
+            startpoints=None,
+            endpoints=None,
             fill=fill,
             interpolation=self.interpolation,
             **params,
...
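This change only replaces two positional None arguments with keyword arguments; behavior is identical. A quick sketch of the equivalence at the call site (image and coefficients here are arbitrary):

import torch
from torchvision.transforms.v2 import functional as F

img = torch.randint(0, 256, (3, 26, 34), dtype=torch.uint8)
coeffs = [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018]

# Equivalent calls; the keyword form makes explicit which of perspective's
# parameters the two None values bind to (startpoints and endpoints).
positional = F.perspective(img, None, None, coefficients=coeffs)
keyword = F.perspective(img, startpoints=None, endpoints=None, coefficients=coeffs)
assert torch.equal(positional, keyword)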
torchvision/transforms/v2/functional/_geometry.py

@@ -1552,7 +1552,7 @@ def _perspective_image_pil(
     image: PIL.Image.Image,
     startpoints: Optional[List[List[int]]],
     endpoints: Optional[List[List[int]]],
-    interpolation: Union[InterpolationMode, int] = InterpolationMode.BICUBIC,
+    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
     fill: _FillTypeJIT = None,
     coefficients: Optional[List[float]] = None,
 ) -> PIL.Image.Image:
...
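This one-line change switches the PIL helper's default interpolation from BICUBIC to BILINEAR, presumably to align it with the tensor kernel's default. Code that depends on a specific resampling mode should pass it explicitly rather than rely on a backend default; a sketch with arbitrary inputs:

import torch
from torchvision.transforms import InterpolationMode
from torchvision.transforms.v2 import functional as F

img = torch.randint(0, 256, (3, 26, 34), dtype=torch.uint8)
coeffs = [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018]

out = F.perspective(
    F.to_pil_image(img),
    startpoints=None,
    endpoints=None,
    coefficients=coeffs,
    interpolation=InterpolationMode.BILINEAR,  # explicit, backend-independent
)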