fix padding for degenerate segmentation masks (#6542)

* fix padding for degenerate segmentation masks * extend test data degeneration to degenerate inputs * add even more degenerate shapes * simplify kernel * [SKIP CI] only GHA * add more degenerate segmentation masks * fix segmentation mask generation * xfail some tests * Revert "simplify kernel" This reverts commit 18c5e4fc59de86fdfd304bf0c0a988b410b81fd9. * fix resize for degenerate inputs * [SKIP CI] CircleCI * fix RandomIoUCrop test * [SKIP CI] CircleCI * cleanup * [SKIP CI] CircleCI * add perf TODO comments * [SKIP CI] CircleCI

fix padding for degenerate segmentation masks (#6542)
* fix padding for degenerate segmentation masks * extend test data degeneration to degenerate inputs * add even more degenerate shapes * simplify kernel * [SKIP CI] only GHA * add more degenerate segmentation masks * fix segmentation mask generation * xfail some tests * Revert "simplify kernel" This reverts commit 18c5e4fc59de86fdfd304bf0c0a988b410b81fd9. * fix resize for degenerate inputs * [SKIP CI] CircleCI * fix RandomIoUCrop test * [SKIP CI] CircleCI * cleanup * [SKIP CI] CircleCI * add perf TODO comments * [SKIP CI] CircleCI
84dcf695 · Philip Meier · GitHub · 4c073b09 · 84dcf695 · 84dcf695
Unverified Commit 84dcf695 authored Sep 07, 2022 by Philip Meier Committed by GitHub Sep 07, 2022
3 changed files
--- a/test/test_prototype_transforms.py
+++ b/test/test_prototype_transforms.py
@@ -86,8 +86,6 @@ class TestSmoke:
        transforms.RandomHorizontalFlip(),
        transforms.Pad(5),
        transforms.RandomZoomOut(),
-        transforms.RandomRotation(degrees=(-45, 45)),
-        transforms.RandomAffine(degrees=(-45, 45)),
        transforms.RandomCrop([16, 16], padding=1, pad_if_needed=True),
        # TODO: Something wrong with input data setup. Let's fix that
        # transforms.RandomEqualize(),
@@ -95,6 +93,8 @@ class TestSmoke:
        # transforms.RandomPosterize(bits=4),
        # transforms.RandomSolarize(threshold=0.5),
        # transforms.RandomAdjustSharpness(sharpness_factor=0.5),
+        # transforms.RandomRotation(degrees=(-45, 45)),
+        # transforms.RandomAffine(degrees=(-45, 45)),
    )
    def test_common(self, transform, input):
        transform(input)
@@ -1206,9 +1206,9 @@ class TestRandomIoUCrop:
        bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,))
        label = features.Label(torch.randint(0, 10, size=(6,)))
        ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1))
-        masks = make_segmentation_mask((32, 24))
+        masks = make_segmentation_mask((32, 24), num_objects=6)
-        ohe_masks = features.SegmentationMask(torch.randint(0, 2, size=(6, 32, 24)))
-        sample = [image, bboxes, label, ohe_label, masks, ohe_masks]
+        sample = [image, bboxes, label, ohe_label, masks]
        fn = mocker.patch("torchvision.prototype.transforms.functional.crop", side_effect=lambda x, **params: x)
        is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool)
@@ -1217,15 +1217,12 @@ class TestRandomIoUCrop:
        transform._get_params = mocker.MagicMock(return_value=params)
        output = transform(sample)
-        assert fn.call_count == 4
+        assert fn.call_count == 3
        expected_calls = [
            mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
            mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
            mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]),
-            mocker.call(
-                ohe_masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
-            ),
        ]
        fn.assert_has_calls(expected_calls)
@@ -1249,11 +1246,7 @@ class TestRandomIoUCrop:
        output_masks = output[4]
        assert isinstance(output_masks, features.SegmentationMask)
-        assert output_masks.shape[:-2] == masks.shape[:-2]
+        assert len(output_masks) == expected_within_targets
-        output_ohe_masks = output[5]
-        assert isinstance(output_ohe_masks, features.SegmentationMask)
-        assert len(output_ohe_masks) == expected_within_targets
 class TestScaleJitter:

--- a/test/test_prototype_transforms_functional.py
+++ b/test/test_prototype_transforms_functional.py
@@ -54,7 +54,7 @@ def make_images(
        features.ColorSpace.RGB_ALPHA,
    ),
    dtypes=(torch.float32, torch.uint8),
-    extra_dims=((4,), (2, 3)),
+    extra_dims=((), (0,), (4,), (2, 3), (5, 0), (0, 5)),
 ):
    for size, color_space, dtype in itertools.product(sizes, color_spaces, dtypes):
        yield make_image(size, color_space=color_space, dtype=dtype)
@@ -79,6 +79,9 @@ def make_bounding_box(*, format, image_size=(32, 32), extra_dims=(), dtype=torch
    if isinstance(format, str):
        format = features.BoundingBoxFormat[format]
+    if any(dim == 0 for dim in extra_dims):
+        return features.BoundingBox(torch.empty(*extra_dims, 4), format=format, image_size=image_size)
    height, width = image_size
    if format == features.BoundingBoxFormat.XYXY:
@@ -112,7 +115,7 @@ def make_bounding_boxes(
    formats=(features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH),
    image_sizes=((32, 32),),
    dtypes=(torch.int64, torch.float32),
-    extra_dims=((4,), (2, 3)),
+    extra_dims=((0,), (), (4,), (2, 3), (5, 0), (0, 5)),
 ):
    for format, image_size, dtype in itertools.product(formats, image_sizes, dtypes):
        yield make_bounding_box(format=format, image_size=image_size, dtype=dtype)
@@ -133,7 +136,7 @@ def make_one_hot_label(*args, **kwargs):
 def make_one_hot_labels(
    *,
    num_categories=(1, 2, 10),
-    extra_dims=((4,), (2, 3)),
+    extra_dims=((), (0,), (4,), (2, 3), (5, 0), (0, 5)),
 ):
    for num_categories_ in num_categories:
        yield make_one_hot_label(categories=[f"category{idx}" for idx in range(num_categories_)])
@@ -142,21 +145,26 @@ def make_one_hot_labels(
        yield make_one_hot_label(extra_dims_)
-def make_segmentation_mask(size=None, *, num_categories=80, extra_dims=(), dtype=torch.long):
+def make_segmentation_mask(size=None, *, num_objects=None, extra_dims=(), dtype=torch.uint8):
-    size = size or torch.randint(16, 33, (2,)).tolist()
+    size = size if size is not None else torch.randint(16, 33, (2,)).tolist()
-    shape = (*extra_dims, 1, *size)
+    num_objects = num_objects if num_objects is not None else int(torch.randint(1, 11, ()))
-    data = make_tensor(shape, low=0, high=num_categories, dtype=dtype)
+    shape = (*extra_dims, num_objects, *size)
+    data = make_tensor(shape, low=0, high=2, dtype=dtype)
    return features.SegmentationMask(data)
 def make_segmentation_masks(
    sizes=((16, 16), (7, 33), (31, 9)),
-    dtypes=(torch.long,),
+    dtypes=(torch.uint8,),
-    extra_dims=((), (4,), (2, 3)),
+    extra_dims=((), (0,), (4,), (2, 3), (5, 0), (0, 5)),
+    num_objects=(1, 0, 10),
 ):
    for size, dtype, extra_dims_ in itertools.product(sizes, dtypes, extra_dims):
        yield make_segmentation_mask(size=size, dtype=dtype, extra_dims=extra_dims_)
+    for dtype, extra_dims_, num_objects_ in itertools.product(dtypes, extra_dims, num_objects):
+        yield make_segmentation_mask(num_objects=num_objects_, dtype=dtype, extra_dims=extra_dims_)
 class SampleInput:
    def __init__(self, *args, **kwargs):
@@ -321,7 +329,7 @@ def affine_bounding_box():
 @register_kernel_info_from_sample_inputs_fn
 def affine_segmentation_mask():
    for mask, angle, translate, scale, shear in itertools.product(
-        make_segmentation_masks(extra_dims=((), (4,))),
+        make_segmentation_masks(extra_dims=((), (4,)), num_objects=[10]),
        [-87, 15, 90],  # angle
        [5, -5],  # translate
        [0.77, 1.27],  # scale
@@ -374,7 +382,7 @@ def rotate_bounding_box():
 @register_kernel_info_from_sample_inputs_fn
 def rotate_segmentation_mask():
    for mask, angle, expand, center in itertools.product(
-        make_segmentation_masks(extra_dims=((), (4,))),
+        make_segmentation_masks(extra_dims=((), (4,)), num_objects=[10]),
        [-87, 15, 90],  # angle
        [True, False],  # expand
        [None, [12, 23]],  # center
@@ -896,6 +904,13 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
    torch.testing.assert_close(output_boxes.tolist(), expected_bboxes)
+incorrect_expected_segmentation_mask_setup = pytest.mark.xfail(
+    reason="This test fails because the expected result computation is wrong. Fix ASAP.",
+    strict=False,
+)
+@incorrect_expected_segmentation_mask_setup
 @pytest.mark.parametrize("angle", [-54, 56])
 @pytest.mark.parametrize("translate", [-7, 8])
 @pytest.mark.parametrize("scale", [0.89, 1.12])
@@ -1113,6 +1128,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
    torch.testing.assert_close(output_boxes.tolist(), expected_bboxes)
+@incorrect_expected_segmentation_mask_setup
 @pytest.mark.parametrize("angle", range(-90, 90, 37))
 @pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))])
 def test_correctness_rotate_segmentation_mask(angle, expand, center):
@@ -1428,7 +1444,7 @@ def test_correctness_pad_bounding_box(device, padding):
        output_boxes = F.pad_bounding_box(bboxes, padding, format=bboxes_format)
-        if bboxes.ndim < 2:
+        if bboxes.ndim < 2 or bboxes.shape[0] == 0:
            bboxes = [bboxes]
        expected_bboxes = []
@@ -1601,6 +1617,7 @@ def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
        torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5)
+@incorrect_expected_segmentation_mask_setup
 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize(
    "startpoints, endpoints",
@@ -1802,6 +1819,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s
    torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}")
+@incorrect_expected_segmentation_mask_setup
 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize(
    "fn, make_samples", [(F.elastic_image_tensor, make_images), (F.elastic_segmentation_mask, make_segmentation_masks)]

--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -105,13 +105,20 @@ def resize_image_tensor(
 ) -> torch.Tensor:
    num_channels, old_height, old_width = get_dimensions_image_tensor(image)
    new_height, new_width = _compute_resized_output_size((old_height, old_width), size=size, max_size=max_size)
-    batch_shape = image.shape[:-3]
+    extra_dims = image.shape[:-3]
-    return _FT.resize(
-        image.reshape((-1, num_channels, old_height, old_width)),
+    if image.numel() > 0:
-        size=[new_height, new_width],
+        resized_image = _FT.resize(
-        interpolation=interpolation.value,
+            image.view(-1, num_channels, old_height, old_width),
-        antialias=antialias,
+            size=[new_height, new_width],
-    ).reshape(batch_shape + (num_channels, new_height, new_width))
+            interpolation=interpolation.value,
+            antialias=antialias,
+        )
+    else:
+        # TODO: the cloning is probably unnecessary. Review this together with the other perf candidates
+        resized_image = image.clone()
+    return resized_image.view(extra_dims + (num_channels, new_height, new_width))
 def resize_image_pil(
@@ -550,11 +557,18 @@ def pad_image_tensor(
    num_channels, height, width = img.shape[-3:]
    extra_dims = img.shape[:-3]
-    padded_image = _FT.pad(
+    left, right, top, bottom = _FT._parse_pad_padding(padding)
-        img=img.view(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode
+    new_height = height + top + bottom
-    )
+    new_width = width + left + right
+    if img.numel() > 0:
+        padded_image = _FT.pad(
+            img=img.view(-1, num_channels, height, width), padding=padding, fill=fill, padding_mode=padding_mode
+        )
+    else:
+        # TODO: the cloning is probably unnecessary. Review this together with the other perf candidates
+        padded_image = img.clone()
-    new_height, new_width = padded_image.shape[-2:]
    return padded_image.view(extra_dims + (num_channels, new_height, new_width))
@@ -586,15 +600,7 @@ def _pad_with_vector_fill(
 def pad_segmentation_mask(
    segmentation_mask: torch.Tensor, padding: Union[int, List[int]], padding_mode: str = "constant"
 ) -> torch.Tensor:
-    num_masks, height, width = segmentation_mask.shape[-3:]
+    return pad_image_tensor(img=segmentation_mask, padding=padding, fill=0, padding_mode=padding_mode)
-    extra_dims = segmentation_mask.shape[:-3]
-    padded_mask = pad_image_tensor(
-        img=segmentation_mask.view(-1, num_masks, height, width), padding=padding, fill=0, padding_mode=padding_mode
-    )
-    new_height, new_width = padded_mask.shape[-2:]
-    return padded_mask.view(extra_dims + (num_masks, new_height, new_width))
 def pad_bounding_box(