Allow v2 Resize to resize longer edge exactly to `max_size` (#8459)

Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>

Allow v2 Resize to resize longer edge exactly to `max_size` (#8459)
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
7896ffd9 · Siddarth Ijju · GitHub · 10239873 · 7896ffd9 · 7896ffd9
Unverified commit 7896ffd9, authored Jun 05, 2024 by Siddarth Ijju; committed by GitHub on Jun 05, 2024.
4 changed files
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -99,7 +99,7 @@ def _script(obj):
        return torch.jit.script(obj)
    except Exception as error:
        name = getattr(obj, "__name__", obj.__class__.__name__)
-        raise AssertionError(f"Trying to `torch.jit.script` '{name}' raised the error above.") from error
+        raise AssertionError(f"Trying to `torch.jit.script` `{name}` raised the error above.") from error
 def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs):
@@ -553,10 +553,12 @@ def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new
 class TestResize:
    INPUT_SIZE = (17, 11)
-    OUTPUT_SIZES = [17, [17], (17,), [12, 13], (12, 13)]
+    OUTPUT_SIZES = [17, [17], (17,), None, [12, 13], (12, 13)]
    def _make_max_size_kwarg(self, *, use_max_size, size):
-        if use_max_size:
+        if size is None:
+            max_size = min(list(self.INPUT_SIZE))
+        elif use_max_size:
            if not (isinstance(size, int) or len(size) == 1):
                # This would result in an `ValueError`
                return None
@@ -568,10 +570,13 @@ class TestResize:
        return dict(max_size=max_size)
    def _compute_output_size(self, *, input_size, size, max_size):
-        if not (isinstance(size, int) or len(size) == 1):
+        if size is None:
+            size = max_size
+        elif not (isinstance(size, int) or len(size) == 1):
            return tuple(size)
-        if not isinstance(size, int):
+        elif not isinstance(size, int):
            size = size[0]
        old_height, old_width = input_size
@@ -658,10 +663,13 @@ class TestResize:
        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
    )
    def test_functional(self, size, make_input):
+        max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size)
        check_functional(
            F.resize,
            make_input(self.INPUT_SIZE),
            size=size,
+            **max_size_kwarg,
            antialias=True,
            check_scripted_smoke=not isinstance(size, int),
        )
@@ -695,11 +703,13 @@ class TestResize:
        ],
    )
    def test_transform(self, size, device, make_input):
+        max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size)
        check_transform(
-            transforms.Resize(size=size, antialias=True),
+            transforms.Resize(size=size, **max_size_kwarg, antialias=True),
            make_input(self.INPUT_SIZE, device=device),
            # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes
-            check_v1_compatibility=dict(rtol=0, atol=1),
+            check_v1_compatibility=dict(rtol=0, atol=1) if size is not None else False,
        )
    def _check_output_size(self, input, output, *, size, max_size):
@@ -801,7 +811,11 @@ class TestResize:
        ],
    )
    def test_max_size_error(self, size, make_input):
-        if isinstance(size, int) or len(size) == 1:
+        if size is None:
+            # value can be anything other than an integer
+            max_size = None
+            match = "max_size must be an integer when size is None"
+        elif isinstance(size, int) or len(size) == 1:
            max_size = (size if isinstance(size, int) else size[0]) - 1
            match = "must be strictly greater than the requested size"
        else:
@@ -812,6 +826,37 @@ class TestResize:
        with pytest.raises(ValueError, match=match):
            F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True)
+        if isinstance(size, list) and len(size) != 1:
+            with pytest.raises(ValueError, match="max_size should only be passed if size is None or specifies"):
+                F.resize(make_input(self.INPUT_SIZE), size=size, max_size=500)
+    @pytest.mark.parametrize(
+        "input_size, max_size, expected_size",
+        [
+            ((10, 10), 10, (10, 10)),
+            ((10, 20), 40, (20, 40)),
+            ((20, 10), 40, (40, 20)),
+            ((10, 20), 10, (5, 10)),
+            ((20, 10), 10, (10, 5)),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image_tensor,
+            make_image_pil,
+            make_image,
+            make_bounding_boxes,
+            make_segmentation_mask,
+            make_detection_masks,
+            make_video,
+        ],
+    )
+    def test_resize_size_none(self, input_size, max_size, expected_size, make_input):
+        img = make_input(input_size)
+        out = F.resize(img, size=None, max_size=max_size)
+        assert F.get_size(out)[-2:] == list(expected_size)
    @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES)
    @pytest.mark.parametrize(
        "make_input",
@@ -834,7 +879,7 @@ class TestResize:
        assert_equal(actual, expected)
    def test_transform_unknown_size_error(self):
-        with pytest.raises(ValueError, match="size can either be an integer or a sequence of one or two integers"):
+        with pytest.raises(ValueError, match="size can be an integer, a sequence of one or two integers, or None"):
            transforms.Resize(size=object())
    @pytest.mark.parametrize(

--- a/torchvision/transforms/functional.py
+++ b/torchvision/transforms/functional.py
@@ -351,13 +351,22 @@ def normalize(tensor: Tensor, mean: List[float], std: List[float], inplace: bool
 def _compute_resized_output_size(
-    image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None
+    image_size: Tuple[int, int],
+    size: Optional[List[int]],
+    max_size: Optional[int] = None,
+    allow_size_none: bool = False,  # only True in v2
 ) -> List[int]:
-    if len(size) == 1:  # specified size only for the smallest edge
    h, w = image_size
    short, long = (w, h) if w <= h else (h, w)
+    if size is None:
+        if not allow_size_none:
+            raise ValueError("This should never happen!!")
+        if not isinstance(max_size, int):
+            raise ValueError(f"max_size must be an integer when size is None, but got {max_size} instead.")
+        new_short, new_long = int(max_size * short / long), max_size
+        new_w, new_h = (new_short, new_long) if w <= h else (new_long, new_short)
+    elif len(size) == 1:  # specified size only for the smallest edge
        requested_new_short = size if isinstance(size, int) else size[0]
        new_short, new_long = requested_new_short, int(requested_new_short * long / short)
        if max_size is not None:

--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -75,11 +75,15 @@ class Resize(Transform):
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
    Args:
-        size (sequence or int): Desired output size. If size is a sequence like
+        size (sequence, int, or None): Desired
-            (h, w), output size will be matched to this. If size is an int,
+            output size.
-            smaller edge of the image will be matched to this number.
-            i.e, if height > width, then image will be rescaled to
+            - If size is a sequence like (h, w), output size will be matched to this.
+            - If size is an int, smaller edge of the image will be matched to this
+              number.  i.e, if height > width, then image will be rescaled to
              (size * height / width, size).
+            - If size is None, the output shape is determined by the ``max_size``
+              parameter.
            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
@@ -89,13 +93,21 @@ class Resize(Transform):
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        max_size (int, optional): The maximum allowed for the longer edge of
-            the resized image. If the longer edge of the image is greater
+            the resized image.
+            - If ``size`` is an int: if the longer edge of the image is greater
              than ``max_size`` after being resized according to ``size``,
              ``size`` will be overruled so that the longer edge is equal to
-            ``max_size``.
+              ``max_size``. As a result, the smaller edge may be shorter than
-            As a result, the smaller edge may be shorter than ``size``. This
+              ``size``. This is only supported if ``size`` is an int (or a
-            is only supported if ``size`` is an int (or a sequence of length
+              sequence of length 1 in torchscript mode).
-            1 in torchscript mode).
+            - If ``size`` is None: the longer edge of the image will be matched
+              to max_size.  i.e, if height > width, then image will be rescaled
+              to (max_size, max_size * width / height).
+            This should be left to ``None`` (default) when ``size`` is a
+            sequence.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
@@ -120,7 +132,7 @@ class Resize(Transform):
    def __init__(
        self,
-        size: Union[int, Sequence[int]],
+        size: Union[int, Sequence[int], None],
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        max_size: Optional[int] = None,
        antialias: Optional[bool] = True,
@@ -131,9 +143,12 @@ class Resize(Transform):
            size = [size]
        elif isinstance(size, Sequence) and len(size) in {1, 2}:
            size = list(size)
+        elif size is None:
+            if not isinstance(max_size, int):
+                raise ValueError(f"max_size must be an integer when size is None, but got {max_size} instead.")
        else:
            raise ValueError(
-                f"size can either be an integer or a sequence of one or two integers, but got {size} instead."
+                f"size can be an integer, a sequence of one or two integers, or None, but got {size} instead."
            )
        self.size = size

--- a/torchvision/transforms/v2/functional/_geometry.py
+++ b/torchvision/transforms/v2/functional/_geometry.py
@@ -159,21 +159,21 @@ vflip = vertical_flip
 def _compute_resized_output_size(
-    canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None
+    canvas_size: Tuple[int, int], size: Optional[List[int]], max_size: Optional[int] = None
 ) -> List[int]:
    if isinstance(size, int):
        size = [size]
-    elif max_size is not None and len(size) != 1:
+    elif max_size is not None and size is not None and len(size) != 1:
        raise ValueError(
-            "max_size should only be passed if size specifies the length of the smaller edge, "
+            "max_size should only be passed if size is None or specifies the length of the smaller edge, "
            "i.e. size should be an int or a sequence of length 1 in torchscript mode."
        )
-    return __compute_resized_output_size(canvas_size, size=size, max_size=max_size)
+    return __compute_resized_output_size(canvas_size, size=size, max_size=max_size, allow_size_none=True)
 def resize(
    inpt: torch.Tensor,
-    size: List[int],
+    size: Optional[List[int]],
    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
    max_size: Optional[int] = None,
    antialias: Optional[bool] = True,
@@ -206,7 +206,7 @@ def _do_native_uint8_resize_on_cpu(interpolation: InterpolationMode) -> bool:
 @_register_kernel_internal(resize, tv_tensors.Image)
 def resize_image(
    image: torch.Tensor,
-    size: List[int],
+    size: Optional[List[int]],
    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
    max_size: Optional[int] = None,
    antialias: Optional[bool] = True,
@@ -310,7 +310,7 @@ def __resize_image_pil_dispatch(
    return _resize_image_pil(image, size=size, interpolation=interpolation, max_size=max_size)
-def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = None) -> torch.Tensor:
+def resize_mask(mask: torch.Tensor, size: Optional[List[int]], max_size: Optional[int] = None) -> torch.Tensor:
    if mask.ndim < 3:
        mask = mask.unsqueeze(0)
        needs_squeeze = True
@@ -334,7 +334,10 @@ def _resize_mask_dispatch(
 def resize_bounding_boxes(
-    bounding_boxes: torch.Tensor, canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None
+    bounding_boxes: torch.Tensor,
+    canvas_size: Tuple[int, int],
+    size: Optional[List[int]],
+    max_size: Optional[int] = None,
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
    old_height, old_width = canvas_size
    new_height, new_width = _compute_resized_output_size(canvas_size, size=size, max_size=max_size)
@@ -353,7 +356,7 @@ def resize_bounding_boxes(
 @_register_kernel_internal(resize, tv_tensors.BoundingBoxes, tv_tensor_wrapper=False)
 def _resize_bounding_boxes_dispatch(
-    inpt: tv_tensors.BoundingBoxes, size: List[int], max_size: Optional[int] = None, **kwargs: Any
+    inpt: tv_tensors.BoundingBoxes, size: Optional[List[int]], max_size: Optional[int] = None, **kwargs: Any
 ) -> tv_tensors.BoundingBoxes:
    output, canvas_size = resize_bounding_boxes(
        inpt.as_subclass(torch.Tensor), inpt.canvas_size, size, max_size=max_size
@@ -364,7 +367,7 @@ def _resize_bounding_boxes_dispatch(
 @_register_kernel_internal(resize, tv_tensors.Video)
 def resize_video(
    video: torch.Tensor,
-    size: List[int],
+    size: Optional[List[int]],
    interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
    max_size: Optional[int] = None,
    antialias: Optional[bool] = True,