import math
import numbers
import warnings
from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Type, Union

import PIL.Image
import torch

from torchvision import transforms as _transforms, tv_tensors
from torchvision.ops.boxes import box_iou
from torchvision.transforms.functional import _get_perspective_coeffs
from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform
from torchvision.transforms.v2.functional._utils import _FillType

from ._transform import _RandomApplyTransform
from ._utils import (
    _check_padding_arg,
    _check_padding_mode_arg,
    _check_sequence_input,
    _get_fill,
    _setup_angle,
    _setup_fill_arg,
    _setup_number_or_seq,
    _setup_size,
    get_bounding_boxes,
    has_all,
    has_any,
    is_pure_tensor,
    query_size,
)


class RandomHorizontalFlip(_RandomApplyTransform):
    """Horizontally flip the input with a given probability.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        p (float, optional): probability of the input being flipped. Default value is 0.5
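
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # input shape is arbitrary. With p=1.0 the flip is always applied.
        >>> transform = RandomHorizontalFlip(p=1.0)
        >>> img = tv_tensors.Image(torch.rand(3, 8, 8))
        >>> torch.equal(transform(img), img.flip(-1))
        True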
    """

    _v1_transform_cls = _transforms.RandomHorizontalFlip

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.horizontal_flip, inpt)


class RandomVerticalFlip(_RandomApplyTransform):
    """Vertically flip the input with a given probability.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        p (float, optional): probability of the input being flipped. Default value is 0.5
    """

    _v1_transform_cls = _transforms.RandomVerticalFlip

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.vertical_flip, inpt)


class Resize(Transform):
    """Resize the input to the given size.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        size (sequence, int, or None): Desired output size.

            - If size is a sequence like (h, w), output size will be matched to this.
            - If size is an int, smaller edge of the image will be matched to this
              number.  i.e, if height > width, then image will be rescaled to
              (size * height / width, size).
            - If size is None, the output shape is determined by the ``max_size``
              parameter.

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        max_size (int, optional): The maximum allowed for the longer edge of
            the resized image.

            - If ``size`` is an int: if the longer edge of the image is greater
              than ``max_size`` after being resized according to ``size``,
              ``size`` will be overruled so that the longer edge is equal to
              ``max_size``. As a result, the smaller edge may be shorter than
              ``size``. This is only supported if ``size`` is an int (or a
              sequence of length 1 in torchscript mode).
            - If ``size`` is None: the longer edge of the image will be matched
              to max_size.  i.e, if height > width, then image will be rescaled
              to (max_size, max_size * width / height).

            This should be left to ``None`` (default) when ``size`` is a
            sequence.

        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL does not support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The default value changed from ``None`` to ``True`` in
            v0.17, for the PIL and Tensor backends to be consistent.
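
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # input shape is arbitrary. An int size matches the smaller edge,
        >>> # while a (h, w) pair is used as-is.
        >>> img = tv_tensors.Image(torch.rand(3, 200, 400))
        >>> Resize(size=(224, 224))(img).shape
        torch.Size([3, 224, 224])
        >>> Resize(size=256)(img).shape  # smaller edge 200 -> 256, so 400 -> 512
        torch.Size([3, 256, 512])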
    """

    _v1_transform_cls = _transforms.Resize

    def __init__(
        self,
        size: Union[int, Sequence[int], None],
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        max_size: Optional[int] = None,
        antialias: Optional[bool] = True,
    ) -> None:
        super().__init__()

        if isinstance(size, int):
            size = [size]
        elif isinstance(size, Sequence) and len(size) in {1, 2}:
            size = list(size)
        elif size is None:
            if not isinstance(max_size, int):
                raise ValueError(f"max_size must be an integer when size is None, but got {max_size} instead.")
        else:
            raise ValueError(
                f"size can be an integer, a sequence of one or two integers, or None, but got {size} instead."
            )
        self.size = size

        self.interpolation = interpolation
        self.max_size = max_size
        self.antialias = antialias

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize,
            inpt,
            self.size,
            interpolation=self.interpolation,
            max_size=self.max_size,
            antialias=self.antialias,
        )


class CenterCrop(Transform):
    """Crop the input at the center.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
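
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # input shape is arbitrary. Inputs smaller than the crop along any
        >>> # edge are zero-padded before cropping.
        >>> img = tv_tensors.Image(torch.rand(3, 100, 300))
        >>> CenterCrop(size=128)(img).shape
        torch.Size([3, 128, 128])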
    """

    _v1_transform_cls = _transforms.CenterCrop

    def __init__(self, size: Union[int, Sequence[int]]):
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.center_crop, inpt, output_size=self.size)


class RandomResizedCrop(Transform):
    """Crop a random portion of the input and resize it to a given size.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    A crop of the original input is made: the crop has a random area (H * W)
    and a random aspect ratio. This crop is finally resized to the given
    size. This is popularly used to train the Inception networks.

    Args:
        size (int or sequence): expected output size of the crop, for each edge. If size is an
            int instead of sequence like (h, w), a square output size ``(size, size)`` is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop,
            before resizing. The scale is defined with respect to the area of the original image.
        ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before
            resizing.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL does not support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The default value changed from ``None`` to ``True`` in
            v0.17, for the PIL and Tensor backends to be consistent.
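
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shapes and bounds are arbitrary. A crop covering 50%-100% of the
        >>> # area is sampled and resized to 224x224.
        >>> img = tv_tensors.Image(torch.rand(3, 300, 400))
        >>> RandomResizedCrop(size=224, scale=(0.5, 1.0))(img).shape
        torch.Size([3, 224, 224])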
    """

    _v1_transform_cls = _transforms.RandomResizedCrop

    def __init__(
        self,
        size: Union[int, Sequence[int]],
        scale: Tuple[float, float] = (0.08, 1.0),
        ratio: Tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0),
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[bool] = True,
    ) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

        if not isinstance(scale, Sequence):
            raise TypeError("Scale should be a sequence")
        if not isinstance(ratio, Sequence):
            raise TypeError("Ratio should be a sequence")
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            warnings.warn("Scale and ratio should be of kind (min, max)")

        self.scale = scale
        self.ratio = ratio
        self.interpolation = interpolation
        self.antialias = antialias

        self._log_ratio = torch.log(torch.tensor(self.ratio))

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)
        area = height * width

        log_ratio = self._log_ratio
        for _ in range(10):
            target_area = area * torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
            aspect_ratio = torch.exp(
                torch.empty(1).uniform_(
                    log_ratio[0],  # type: ignore[arg-type]
                    log_ratio[1],  # type: ignore[arg-type]
                )
            ).item()

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if 0 < w <= width and 0 < h <= height:
                i = torch.randint(0, height - h + 1, size=(1,)).item()
                j = torch.randint(0, width - w + 1, size=(1,)).item()
                break
        else:
            # Fallback to central crop
            in_ratio = float(width) / float(height)
            if in_ratio < min(self.ratio):
                w = width
                h = int(round(w / min(self.ratio)))
            elif in_ratio > max(self.ratio):
                h = height
                w = int(round(h * max(self.ratio)))
            else:  # whole image
                w = width
                h = height
            i = (height - h) // 2
            j = (width - w) // 2

        return dict(top=i, left=j, height=h, width=w)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resized_crop, inpt, **params, size=self.size, interpolation=self.interpolation, antialias=self.antialias
        )


class FiveCrop(Transform):
    """Crop the image or video into four corners and the central crop.

    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
    For example, the image can have ``[..., C, H, W]`` shape.

    .. Note::
         This transform returns a tuple of images and there may be a mismatch in the number of
         inputs and targets your Dataset returns. See below for an example of how to deal with
         this.

    Args:
         size (sequence or int): Desired output size of the crop. If size is an ``int``
            instead of sequence like (h, w), a square crop of size (size, size) is made.
            If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).

    Example:
        >>> class BatchMultiCrop(transforms.Transform):
        ...     def forward(self, sample: Tuple[Tuple[Union[tv_tensors.Image, tv_tensors.Video], ...], int]):
        ...         images_or_videos, label = sample
        ...         batch_size = len(images_or_videos)
        ...         image_or_video = images_or_videos[0]
        ...         images_or_videos = tv_tensors.wrap(torch.stack(images_or_videos), like=image_or_video)
        ...         labels = torch.full((batch_size,), label, device=images_or_videos.device)
        ...         return images_or_videos, labels
        ...
        >>> image = tv_tensors.Image(torch.rand(3, 256, 256))
        >>> label = 3
        >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()])
        >>> images, labels = transform(image, label)
        >>> images.shape
        torch.Size([5, 3, 224, 224])
        >>> labels
        tensor([3, 3, 3, 3, 3])
    """

    _v1_transform_cls = _transforms.FiveCrop

    def __init__(self, size: Union[int, Sequence[int]]) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
            warnings.warn(
                f"{type(self).__name__}() is currently passing through inputs of type "
                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
            )
        return super()._call_kernel(functional, inpt, *args, **kwargs)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.five_crop, inpt, self.size)

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
            raise TypeError(f"BoundingBoxes and Masks are not supported by {type(self).__name__}()")


class TenCrop(Transform):
    """Crop the image or video into four corners and the central crop plus the flipped version of
    these (horizontal flipping is used by default).

    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
    For example, the image can have ``[..., C, H, W]`` shape.

    See :class:`~torchvision.transforms.v2.FiveCrop` for an example.

    .. Note::
         This transform returns a tuple of images and there may be a mismatch in the number of
         inputs and targets your Dataset returns. See below for an example of how to deal with
         this.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
        vertical_flip (bool, optional): Use vertical flipping instead of horizontal
    """

    _v1_transform_cls = _transforms.TenCrop

    def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
        self.vertical_flip = vertical_flip

    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
            warnings.warn(
                f"{type(self).__name__}() is currently passing through inputs of type "
                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
            )
        return super()._call_kernel(functional, inpt, *args, **kwargs)

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
            raise TypeError(f"BoundingBoxes and Masks are not supported by {type(self).__name__}()")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.ten_crop, inpt, self.size, vertical_flip=self.vertical_flip)


class Pad(Transform):
    """Pad the input on all sides with the given "pad" value.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        padding (int or sequence): Padding on each border. If a single int is provided this
            is used to pad all borders. If sequence of length 2 is provided this is the padding
            on left/right and top/bottom respectively. If a sequence of length 4 is provided
            this is the padding for the left, top, right and bottom borders respectively.

            .. note::
                In torchscript mode padding as single int is not supported, use a sequence of
                length 1: ``[padding, ]``.
        fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
            Default is "constant".

            - constant: pads with a constant value, this value is specified with fill

            - edge: pads with the last value at the edge of the image.

            - reflect: pads with reflection of image without repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
              will result in [3, 2, 1, 2, 3, 4, 3, 2]

            - symmetric: pads with reflection of image repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
              will result in [2, 1, 1, 2, 3, 4, 4, 3]
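
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and fill value are arbitrary. A length-2 padding pads
        >>> # left/right by 2 and top/bottom by 4.
        >>> img = tv_tensors.Image(torch.zeros(3, 10, 10, dtype=torch.uint8))
        >>> Pad(padding=[2, 4], fill=127)(img).shape
        torch.Size([3, 18, 14])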
    """

    _v1_transform_cls = _transforms.Pad

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        params = super()._extract_params_for_v1_transform()

        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")

        return params

    def __init__(
        self,
        padding: Union[int, Sequence[int]],
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
    ) -> None:
        super().__init__()

        _check_padding_arg(padding)
        _check_padding_mode_arg(padding_mode)

        # This cast does Sequence[int] -> List[int] and is required to make mypy happy
        if not isinstance(padding, int):
            padding = list(padding)
        self.padding = padding
        self.fill = fill
        self._fill = _setup_fill_arg(fill)
        self.padding_mode = padding_mode

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(F.pad, inpt, padding=self.padding, fill=fill, padding_mode=self.padding_mode)  # type: ignore[arg-type]


class RandomZoomOut(_RandomApplyTransform):
    """ "Zoom out" transformation from
    `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.

    This transformation randomly pads images, videos, bounding boxes and masks, creating a zoom out effect.
    Output spatial size is randomly sampled from the original size up to a maximum size configured
    with the ``side_range`` parameter:

    .. code-block:: python

        r = uniform_sample(side_range[0], side_range[1])
        output_width = input_width * r
        output_height = input_height * r

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        fill (number or tuple or dict, optional): Pixel fill value used for the padded area.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to
            scale the input size.
        p (float, optional): probability that the zoom operation will be performed.
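
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and values are arbitrary. With p=1.0 the zoom out is always
        >>> # applied and the output side length lies in side_range times 100.
        >>> img = tv_tensors.Image(torch.zeros(3, 100, 100, dtype=torch.uint8))
        >>> out = RandomZoomOut(fill=127, side_range=(1.0, 2.0), p=1.0)(img)
        >>> 100 <= out.shape[-1] <= 200
        True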
    """

    def __init__(
        self,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        side_range: Sequence[float] = (1.0, 4.0),
        p: float = 0.5,
    ) -> None:
        super().__init__(p=p)

        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        _check_sequence_input(side_range, "side_range", req_sizes=(2,))

        self.side_range = side_range
        if side_range[0] < 1.0 or side_range[0] > side_range[1]:
            raise ValueError(f"Invalid side range provided {side_range}.")

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_h, orig_w = query_size(flat_inputs)

        r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
        canvas_width = int(orig_w * r)
        canvas_height = int(orig_h * r)

        r = torch.rand(2)
        left = int((canvas_width - orig_w) * r[0])
        top = int((canvas_height - orig_h) * r[1])
        right = canvas_width - (left + orig_w)
        bottom = canvas_height - (top + orig_h)
        padding = [left, top, right, bottom]

        return dict(padding=padding)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(F.pad, inpt, **params, fill=fill)


class RandomRotation(Transform):
    """Rotate the input by angle.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        degrees (sequence or number): Range of degrees to select from.
            If degrees is a number instead of sequence like (min, max), the range of degrees
            will be (-degrees, +degrees).
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        expand (bool, optional): Optional expansion flag.
            If true, expands the output to make it large enough to hold the entire rotated image.
            If false or omitted, make the output image the same size as the input image.
            Note that the expand flag assumes rotation around the center (see note below) and no translation.
        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
            Default is the center of the image.

            .. note::

                In theory, setting ``center`` has no effect if ``expand=True``, since the image center will become the
                center of rotation. In practice however, due to numerical precision, this can lead to off-by-one
                differences of the resulting image size compared to using the image center in the first place. Thus, when
                setting ``expand=True``, it's best to leave ``center=None`` (default).
        fill (number or tuple or dict, optional): Pixel fill value used for the area outside the rotated image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
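
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and range are arbitrary. Without ``expand`` the output size
        >>> # equals the input size.
        >>> img = tv_tensors.Image(torch.rand(3, 64, 64))
        >>> RandomRotation(degrees=30)(img).shape
        torch.Size([3, 64, 64])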

    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters

    """

    _v1_transform_cls = _transforms.RandomRotation

    def __init__(
        self,
        degrees: Union[numbers.Number, Sequence],
        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
        expand: bool = False,
        center: Optional[List[float]] = None,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__()
        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
        self.interpolation = interpolation
        self.expand = expand

        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        if center is not None:
            _check_sequence_input(center, "center", req_sizes=(2,))

        self.center = center

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
        return dict(angle=angle)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.rotate,
            inpt,
            **params,
            interpolation=self.interpolation,
            expand=self.expand,
            center=self.center,
            fill=fill,
        )


class RandomAffine(Transform):
    """Random affine transformation the input keeping center invariant.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        degrees (sequence or number): Range of degrees to select from.
            If degrees is a number instead of sequence like (min, max), the range of degrees
            will be (-degrees, +degrees). Set to 0 to deactivate rotations.
        translate (tuple, optional): tuple of maximum absolute fraction for horizontal
            and vertical translations. For example translate=(a, b), then horizontal shift
            is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
            randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
        scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
            randomly sampled from the range a <= scale <= b. Will keep original scale by default.
        shear (sequence or number, optional): Range of degrees to select from.
            If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
            will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
            range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
            an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
            Will not apply shear by default.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value used for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
            Default is the center of the image.
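
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and parameter ranges are arbitrary. The output size always
        >>> # equals the input size.
        >>> img = tv_tensors.Image(torch.rand(3, 64, 64))
        >>> RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1))(img).shape
        torch.Size([3, 64, 64])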

    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters

    """

    _v1_transform_cls = _transforms.RandomAffine

    def __init__(
        self,
        degrees: Union[numbers.Number, Sequence],
        translate: Optional[Sequence[float]] = None,
        scale: Optional[Sequence[float]] = None,
        shear: Optional[Union[int, float, Sequence[float]]] = None,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        center: Optional[List[float]] = None,
    ) -> None:
        super().__init__()
        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
        if translate is not None:
            _check_sequence_input(translate, "translate", req_sizes=(2,))
            for t in translate:
                if not (0.0 <= t <= 1.0):
                    raise ValueError("translation values should be between 0 and 1")
        self.translate = translate
        if scale is not None:
            _check_sequence_input(scale, "scale", req_sizes=(2,))
            for s in scale:
                if s <= 0:
                    raise ValueError("scale values should be positive")
        self.scale = scale

        if shear is not None:
            self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4))
        else:
            self.shear = shear

        self.interpolation = interpolation
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        if center is not None:
            _check_sequence_input(center, "center", req_sizes=(2,))

        self.center = center

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)

        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
        if self.translate is not None:
            max_dx = float(self.translate[0] * width)
            max_dy = float(self.translate[1] * height)
            tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
            ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
            translate = (tx, ty)
        else:
            translate = (0, 0)

        if self.scale is not None:
            scale = torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
        else:
            scale = 1.0

        shear_x = shear_y = 0.0
        if self.shear is not None:
            shear_x = torch.empty(1).uniform_(self.shear[0], self.shear[1]).item()
            if len(self.shear) == 4:
                shear_y = torch.empty(1).uniform_(self.shear[2], self.shear[3]).item()

        shear = (shear_x, shear_y)
        return dict(angle=angle, translate=translate, scale=scale, shear=shear)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.affine,
            inpt,
            **params,
            interpolation=self.interpolation,
            fill=fill,
            center=self.center,
        )


class RandomCrop(Transform):
    """Crop the input at a random location.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
        padding (int or sequence, optional): Optional padding on each border
            of the image. Default is None. If a single int is provided this
            is used to pad all borders. If sequence of length 2 is provided this is the padding
            on left/right and top/bottom respectively. If a sequence of length 4 is provided
            this is the padding for the left, top, right and bottom borders respectively.

            .. note::
                In torchscript mode padding as single int is not supported, use a sequence of
                length 1: ``[padding, ]``.
        pad_if_needed (boolean, optional): It will pad the image if smaller than the
            desired size to avoid raising an exception. Since cropping is done
            after padding, the padding seems to be done at a random offset.
        fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
            Default is constant.

            - constant: pads with a constant value, this value is specified with fill

            - edge: pads with the last value at the edge of the image.

            - reflect: pads with reflection of image without repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
              will result in [3, 2, 1, 2, 3, 4, 3, 2]

            - symmetric: pads with reflection of image repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
              will result in [2, 1, 1, 2, 3, 4, 4, 3]
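
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape is arbitrary. This is the common pad-then-crop augmentation
        >>> # for small images: pad 4 on every border, then crop 32x32.
        >>> img = tv_tensors.Image(torch.rand(3, 32, 32))
        >>> RandomCrop(size=32, padding=4)(img).shape
        torch.Size([3, 32, 32])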
    """

    _v1_transform_cls = _transforms.RandomCrop

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        params = super()._extract_params_for_v1_transform()

        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")

        padding = self.padding
        if padding is not None:
            pad_left, pad_right, pad_top, pad_bottom = padding
            padding = [pad_left, pad_top, pad_right, pad_bottom]
        params["padding"] = padding

        return params

    def __init__(
        self,
        size: Union[int, Sequence[int]],
        padding: Optional[Union[int, Sequence[int]]] = None,
        pad_if_needed: bool = False,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
    ) -> None:
        super().__init__()

        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

        if pad_if_needed or padding is not None:
            if padding is not None:
                _check_padding_arg(padding)
            _check_padding_mode_arg(padding_mode)

        self.padding = F._geometry._parse_pad_padding(padding) if padding else None  # type: ignore[arg-type]
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self._fill = _setup_fill_arg(fill)
        self.padding_mode = padding_mode

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        padded_height, padded_width = query_size(flat_inputs)

        if self.padding is not None:
            pad_left, pad_right, pad_top, pad_bottom = self.padding
            padded_height += pad_top + pad_bottom
            padded_width += pad_left + pad_right
        else:
            pad_left = pad_right = pad_top = pad_bottom = 0

        cropped_height, cropped_width = self.size

        if self.pad_if_needed:
            if padded_height < cropped_height:
                diff = cropped_height - padded_height

                pad_top += diff
                pad_bottom += diff
                padded_height += 2 * diff

            if padded_width < cropped_width:
                diff = cropped_width - padded_width

                pad_left += diff
                pad_right += diff
                padded_width += 2 * diff

        if padded_height < cropped_height or padded_width < cropped_width:
            raise ValueError(
                f"Required crop size {(cropped_height, cropped_width)} is larger than "
                f"{'padded ' if self.padding is not None else ''}input image size {(padded_height, padded_width)}."
            )

        # We need a different order here than we have in self.padding since this padding will be parsed again in `F.pad`
        padding = [pad_left, pad_top, pad_right, pad_bottom]
        needs_pad = any(padding)

        needs_vert_crop, top = (
            (True, int(torch.randint(0, padded_height - cropped_height + 1, size=())))
            if padded_height > cropped_height
            else (False, 0)
        )
        needs_horz_crop, left = (
            (True, int(torch.randint(0, padded_width - cropped_width + 1, size=())))
            if padded_width > cropped_width
            else (False, 0)
        )

        return dict(
            needs_crop=needs_vert_crop or needs_horz_crop,
            top=top,
            left=left,
            height=cropped_height,
            width=cropped_width,
            needs_pad=needs_pad,
            padding=padding,
        )

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if params["needs_pad"]:
            fill = _get_fill(self._fill, type(inpt))
            inpt = self._call_kernel(F.pad, inpt, padding=params["padding"], fill=fill, padding_mode=self.padding_mode)

        if params["needs_crop"]:
            inpt = self._call_kernel(
                F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
            )

        return inpt


class RandomPerspective(_RandomApplyTransform):
    """Perform a random perspective transformation of the input with a given probability.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1.
            Default is 0.5.
        p (float, optional): probability of the input being transformed. Default is 0.5.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value used for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
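
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and distortion are arbitrary. With p=1.0 the distortion is
        >>> # always applied; the output size is unchanged.
        >>> img = tv_tensors.Image(torch.rand(3, 64, 64))
        >>> RandomPerspective(distortion_scale=0.3, p=1.0)(img).shape
        torch.Size([3, 64, 64])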
    """

    _v1_transform_cls = _transforms.RandomPerspective

    def __init__(
        self,
        distortion_scale: float = 0.5,
        p: float = 0.5,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__(p=p)

        if not (0 <= distortion_scale <= 1):
            raise ValueError("Argument distortion_scale value should be between 0 and 1")

        self.distortion_scale = distortion_scale
        self.interpolation = interpolation
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)

        distortion_scale = self.distortion_scale

        half_height = height // 2
        half_width = width // 2
        bound_height = int(distortion_scale * half_height) + 1
        bound_width = int(distortion_scale * half_width) + 1
        topleft = [
            int(torch.randint(0, bound_width, size=(1,))),
            int(torch.randint(0, bound_height, size=(1,))),
        ]
        topright = [
            int(torch.randint(width - bound_width, width, size=(1,))),
            int(torch.randint(0, bound_height, size=(1,))),
        ]
        botright = [
            int(torch.randint(width - bound_width, width, size=(1,))),
            int(torch.randint(height - bound_height, height, size=(1,))),
        ]
        botleft = [
            int(torch.randint(0, bound_width, size=(1,))),
            int(torch.randint(height - bound_height, height, size=(1,))),
        ]
        startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
        endpoints = [topleft, topright, botright, botleft]
        perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints)
        return dict(coefficients=perspective_coeffs)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.perspective,
            inpt,
            startpoints=None,
            endpoints=None,
            fill=fill,
            interpolation=self.interpolation,
            **params,
        )


class ElasticTransform(Transform):
    """Transform the input with elastic transformations.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Given alpha and sigma, it will generate displacement
    vectors for all pixels based on random offsets. Alpha controls the strength
    and sigma controls the smoothness of the displacements.
    The displacements are added to an identity grid and the resulting grid is
    used to transform the input.

    .. note::
        Implementation to transform bounding boxes is approximate (not exact).
        We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
        This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
        Our assumption is that ``displacement * displacement`` is small and can be ignored.
        Large displacements would lead to large errors in the approximation.

    Applications:
        Randomly transforms the morphology of objects in images and produces a
        see-through-water-like effect.

    Args:
        alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
        sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value used for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can also be a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
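
    Example:
        >>> # A minimal illustrative sketch (not from the original file); the
        >>> # shape and parameters are arbitrary. ``alpha`` scales displacement
        >>> # strength, ``sigma`` its smoothness; the output size is unchanged.
        >>> img = tv_tensors.Image(torch.rand(3, 64, 64))
        >>> ElasticTransform(alpha=50.0, sigma=5.0)(img).shape
        torch.Size([3, 64, 64])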
    """

    _v1_transform_cls = _transforms.ElasticTransform

    def __init__(
        self,
        alpha: Union[float, Sequence[float]] = 50.0,
        sigma: Union[float, Sequence[float]] = 5.0,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__()
        self.alpha = _setup_number_or_seq(alpha, "alpha")
        self.sigma = _setup_number_or_seq(sigma, "sigma")

        self.interpolation = interpolation
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        size = list(query_size(flat_inputs))

        dx = torch.rand([1, 1] + size) * 2 - 1
        if self.sigma[0] > 0.0:
            kx = int(8 * self.sigma[0] + 1)
            # if kernel size is even we have to make it odd
            if kx % 2 == 0:
                kx += 1
            dx = self._call_kernel(F.gaussian_blur, dx, [kx, kx], list(self.sigma))
        dx = dx * self.alpha[0] / size[0]

        dy = torch.rand([1, 1] + size) * 2 - 1
        if self.sigma[1] > 0.0:
            ky = int(8 * self.sigma[1] + 1)
            # if kernel size is even we have to make it odd
            if ky % 2 == 0:
                ky += 1
            dy = self._call_kernel(F.gaussian_blur, dy, [ky, ky], list(self.sigma))
        dy = dy * self.alpha[1] / size[1]
        displacement = torch.concat([dx, dy], 1).permute([0, 2, 3, 1])  # 1 x H x W x 2
        return dict(displacement=displacement)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.elastic,
            inpt,
            **params,
            fill=fill,
            interpolation=self.interpolation,
        )


class RandomIoUCrop(Transform):
    """Random IoU crop transformation from
    `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.

    This transformation requires image or video data and ``tv_tensors.BoundingBoxes`` in the input.

    .. warning::
        In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
        after or later in the transforms pipeline.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        min_scale (float, optional): Minimum factor to scale the input size.
        max_scale (float, optional): Maximum factor to scale the input size.
        min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video.
        max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video.
        sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and
            a cropped image or video. Default, ``None``, which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]``.
        trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap.
            Default, 40.
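
    A minimal pipeline sketch (the box coordinates below are illustrative
    assumptions); note the trailing ``SanitizeBoundingBoxes``:

    .. code-block:: python

        import torch
        from torchvision import tv_tensors
        from torchvision.transforms import v2

        transform = v2.Compose([v2.RandomIoUCrop(), v2.SanitizeBoundingBoxes()])
        img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
        boxes = tv_tensors.BoundingBoxes(
            torch.tensor([[10.0, 20.0, 200.0, 240.0]]),
            format="XYXY",
            canvas_size=(480, 640),
        )
        out_img, out_boxes = transform(img, boxes)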
    """

    def __init__(
        self,
        min_scale: float = 0.3,
        max_scale: float = 1.0,
        min_aspect_ratio: float = 0.5,
        max_aspect_ratio: float = 2.0,
        sampler_options: Optional[List[float]] = None,
        trials: int = 40,
    ):
        super().__init__()
        # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.min_aspect_ratio = min_aspect_ratio
        self.max_aspect_ratio = max_aspect_ratio
        if sampler_options is None:
            sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
        self.options = sampler_options
        self.trials = trials

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if not (
            has_all(flat_inputs, tv_tensors.BoundingBoxes)
            and has_any(flat_inputs, PIL.Image.Image, tv_tensors.Image, is_pure_tensor)
        ):
            raise TypeError(
                f"{type(self).__name__}() requires input sample to contain tensor or PIL images "
                "and bounding boxes. Sample can also contain masks."
            )

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_h, orig_w = query_size(flat_inputs)
        bboxes = get_bounding_boxes(flat_inputs)

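        # Outer loop: keep re-drawing a minimum-IoU option until one of the
        # ``trials`` attempts for that option yields a valid crop, so in total
        # this may iterate more than ``trials`` times.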
        while True:
            # sample an option
            idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
            min_jaccard_overlap = self.options[idx]
            if min_jaccard_overlap >= 1.0:  # a value larger than 1 encodes the leave as-is option
                return dict()

            for _ in range(self.trials):
                # check the aspect ratio limitations
                r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)
                new_w = int(orig_w * r[0])
                new_h = int(orig_h * r[1])
                aspect_ratio = new_w / new_h
                if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
                    continue

                # check for 0 area crops
                r = torch.rand(2)
                left = int((orig_w - new_w) * r[0])
                top = int((orig_h - new_h) * r[1])
                right = left + new_w
                bottom = top + new_h
                if left == right or top == bottom:
                    continue

                # check for any valid boxes with centers within the crop area
                xyxy_bboxes = F.convert_bounding_box_format(
                    bboxes.as_subclass(torch.Tensor),
                    bboxes.format,
                    tv_tensors.BoundingBoxFormat.XYXY,
                )
                cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2])
                cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3])
                is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)
                if not is_within_crop_area.any():
                    continue

                # check at least 1 box with jaccard limitations
                xyxy_bboxes = xyxy_bboxes[is_within_crop_area]
                ious = box_iou(
                    xyxy_bboxes,
                    torch.tensor([[left, top, right, bottom]], dtype=xyxy_bboxes.dtype, device=xyxy_bboxes.device),
                )
                if ious.max() < min_jaccard_overlap:
                    continue

                return dict(top=top, left=left, height=new_h, width=new_w, is_within_crop_area=is_within_crop_area)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if len(params) < 1:
            return inpt

        output = self._call_kernel(
            F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
        )

        if isinstance(output, tv_tensors.BoundingBoxes):
            # We "mark" the invalid boxes as degenerate, and they can be
            # removed by a later call to SanitizeBoundingBoxes()
            output[~params["is_within_crop_area"]] = 0

        return output


class ScaleJitter(Transform):
    """Perform Large Scale Jitter on the input according to
    `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" <https://arxiv.org/abs/2012.07177>`_.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        target_size (tuple of int): Target size. This parameter defines base scale for jittering,
            e.g. ``min(target_size[0] / width, target_size[1] / height)``.
        scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL doesn't support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The default value changed from ``None`` to ``True`` in
            v0.17, for the PIL and Tensor backends to be consistent.
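
    A minimal usage sketch (the input shape and ``target_size`` below are
    illustrative assumptions):

    .. code-block:: python

        import torch
        from torchvision.transforms import v2

        jitter = v2.ScaleJitter(target_size=(1024, 1024), scale_range=(0.1, 2.0))
        img = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
        out = jitter(img)  # randomly rescaled relative to the target size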
    """

    def __init__(
        self,
        target_size: Tuple[int, int],
        scale_range: Tuple[float, float] = (0.1, 2.0),
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[bool] = True,
    ):
        super().__init__()
        self.target_size = target_size
        self.scale_range = scale_range
        self.interpolation = interpolation
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_height, orig_width = query_size(flat_inputs)

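        # Sample a scale factor uniformly from ``scale_range`` and apply it on
        # top of the base ratio that fits the image inside ``target_size``.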
        scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0])
        r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale
        new_width = int(orig_width * r)
        new_height = int(orig_height * r)

        return dict(size=(new_height, new_width))

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
        )


class RandomShortestSize(Transform):
    """Randomly resize the input so that its smaller edge matches one of the given ``min_size`` values.

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
        max_size (int, optional): Maximum spatial size. Default, None.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL doesn't support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The default value changed from ``None`` to ``True`` in
            v0.17, for the PIL and Tensor backends to be consistent.
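
    A minimal usage sketch (the size values below are illustrative assumptions):

    .. code-block:: python

        import torch
        from torchvision.transforms import v2

        resize = v2.RandomShortestSize(min_size=[480, 512, 544], max_size=1333)
        img = torch.randint(0, 256, (3, 600, 800), dtype=torch.uint8)
        out = resize(img)  # shorter edge matches a sampled min_size, capped by max_size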
    """

    def __init__(
        self,
        min_size: Union[List[int], Tuple[int], int],
        max_size: Optional[int] = None,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[bool] = True,
    ):
        super().__init__()
        self.min_size = [min_size] if isinstance(min_size, int) else list(min_size)
        self.max_size = max_size
        self.interpolation = interpolation
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_height, orig_width = query_size(flat_inputs)

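        # Scale so the shorter edge matches the sampled ``min_size``; if that
        # would push the longer edge past ``max_size``, shrink the ratio so the
        # longer edge is capped instead.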
        min_size = self.min_size[int(torch.randint(len(self.min_size), ()))]
        r = min_size / min(orig_height, orig_width)
        if self.max_size is not None:
            r = min(r, self.max_size / max(orig_height, orig_width))

        new_width = int(orig_width * r)
        new_height = int(orig_height * r)

        return dict(size=(new_height, new_width))

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
        )


class RandomResize(Transform):
    """Randomly resize the input.

    This transformation can be used together with ``RandomCrop`` as data augmentation to train
    models on image segmentation tasks.

    Output spatial size is randomly sampled from the interval ``[min_size, max_size)``:

    .. code-block:: python

        size = uniform_sample(min_size, max_size)  # max_size is exclusive
        # the smaller edge is matched to ``size``, preserving the aspect ratio
        output_height = size * height / min(height, width)
        output_width = size * width / min(height, width)

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        min_size (int): Minimum output size for random sampling
        max_size (int): Maximum output size for random sampling
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True`` (default): will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL doesn't support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The default value changed from ``None`` to ``True`` in
            v0.17, for the PIL and Tensor backends to be consistent.
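
    A minimal usage sketch (the size bounds below are illustrative assumptions):

    .. code-block:: python

        import torch
        from torchvision.transforms import v2

        resize = v2.RandomResize(min_size=256, max_size=512)
        img = torch.randint(0, 256, (3, 300, 400), dtype=torch.uint8)
        out = resize(img)  # smaller edge resized to a size sampled in [256, 512)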
    """

    def __init__(
        self,
        min_size: int,
        max_size: int,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[bool] = True,
    ) -> None:
        super().__init__()
        self.min_size = min_size
        self.max_size = max_size
        self.interpolation = interpolation
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
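        # Note: the upper bound of torch.randint is exclusive, so the sampled
        # size lies in [min_size, max_size).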
        size = int(torch.randint(self.min_size, self.max_size, ()))
        return dict(size=[size])

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, params["size"], interpolation=self.interpolation, antialias=self.antialias
        )