"src/diffusers/pipeline_flax_utils.py" did not exist on "17c574a16dd505d3b280f39681e9715b7e252194"
transforms_v2_kernel_infos.py 77.2 KB
Newer Older
1
import decimal
2
3
4
5
6
import functools
import itertools
import math

import numpy as np
7
import PIL.Image
8
9
import pytest
import torch.testing
10
import torchvision.ops
11
import torchvision.transforms.v2.functional as F
12
from common_utils import (
13
    ArgsKwargs,
14
    combinations_grid,
15
16
    get_num_channels,
    ImageLoader,
17
    InfoBase,
18
    make_bounding_box_loader,
19
    make_bounding_box_loaders,
20
    make_detection_mask_loader,
21
22
    make_image_loader,
    make_image_loaders,
23
    make_image_loaders_for_interpolation,
24
    make_mask_loaders,
25
    make_video_loader,
26
    make_video_loaders,
27
28
    mark_framework_limitation,
    TestMark,
29
)
30
from torch.utils._pytree import tree_map
31
from torchvision import datapoints
32
from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding
33
34
35
36

__all__ = ["KernelInfo", "KERNEL_INFOS"]


37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class KernelInfo(InfoBase):
    def __init__(
        self,
        kernel,
        *,
        # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name
        # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then
        kernel_name=None,
        # Most common tests use these inputs to check the kernel. As such, they should cover all valid code paths,
        # but should not include extensive parameter combinations, to keep the overall test count moderate.
        sample_inputs_fn,
        # This function should mirror the kernel. It should have the same signature as `kernel` and as such also
        # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should
        # happen inside the function. It should return a tensor or, more precisely, an object that can be compared to
        # a tensor by `assert_close`. If omitted, no reference test will be performed.
        reference_fn=None,
        # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter
        # values to be tested. If not specified, `sample_inputs_fn` will be used.
        reference_inputs_fn=None,
        # If true-ish, triggers a test that checks the kernel for consistency between uint8 and float32 inputs with the
        # reference inputs. This is usually used whenever we use a PIL kernel as reference.
        # Can be a callable in which case it will be called with `other_args, kwargs`. It should return the same
        # structure, but with adapted parameters. This is useful in case a parameter value is closely tied to the input
        # dtype.
        float32_vs_uint8=False,
        # Some kernels don't have dispatchers that would handle logging the usage. Thus, the kernel has to do it
        # manually. If set, triggers a test that makes sure this happens.
        logs_usage=False,
        # See InfoBase
        test_marks=None,
        # See InfoBase
        closeness_kwargs=None,
    ):
        super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs)
        self.kernel = kernel
        self.sample_inputs_fn = sample_inputs_fn
        self.reference_fn = reference_fn
        self.reference_inputs_fn = reference_inputs_fn

        if float32_vs_uint8 and not callable(float32_vs_uint8):
            float32_vs_uint8 = lambda other_args, kwargs: (other_args, kwargs)  # noqa: E731
        self.float32_vs_uint8 = float32_vs_uint8
        self.logs_usage = logs_usage


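# A minimal registration sketch (hypothetical `F.some_kernel_*` names, for
# illustration only; the real entries are appended to `KERNEL_INFOS` below):
#
#   KERNEL_INFOS.append(
#       KernelInfo(
#           F.some_kernel_image_tensor,
#           sample_inputs_fn=sample_inputs_some_kernel_image_tensor,
#           reference_fn=pil_reference_wrapper(F.some_kernel_image_pil),
#           reference_inputs_fn=reference_inputs_some_kernel_image_tensor,
#           float32_vs_uint8=True,
#       )
#   )
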
def pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=False):
    return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae)


def cuda_vs_cpu_pixel_difference(atol=1):
    return {
        (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): pixel_difference_closeness_kwargs(atol, dtype=dtype)
        for dtype in [torch.uint8, torch.float32]
    }


def pil_reference_pixel_difference(atol=1, mae=False):
    return {
        (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(
            atol, mae=mae
        )
    }


def float32_vs_uint8_pixel_difference(atol=1, mae=False):
    return {
        (
            ("TestKernels", "test_float32_vs_uint8"),
            torch.float32,
            "cpu",
        ): pixel_difference_closeness_kwargs(atol, dtype=torch.float32, mae=mae)
    }


def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6):
    return {
        (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False},
    }
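
# The key layout mirrors how `closeness_kwargs` are looked up by the test suite:
# ((test_class, test_name), dtype, device) -> tolerance kwargs. For example:
#
#   >>> scripted_vs_eager_float64_tolerances("cpu")[
#   ...     (("TestKernels", "test_scripted_vs_eager"), torch.float64, "cpu")
#   ... ]
#   {'atol': 1e-06, 'rtol': 1e-06, 'mae': False}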


def pil_reference_wrapper(pil_kernel):
    @functools.wraps(pil_kernel)
    def wrapper(input_tensor, *other_args, **kwargs):
        if input_tensor.dtype != torch.uint8:
            raise pytest.UsageError(f"Can only test uint8 tensor images against PIL, but input is {input_tensor.dtype}")
        if input_tensor.ndim > 3:
            raise pytest.UsageError(
                f"Can only test single tensor images against PIL, but input has shape {input_tensor.shape}"
            )

        input_pil = F.to_image_pil(input_tensor)
        output_pil = pil_kernel(input_pil, *other_args, **kwargs)
        if not isinstance(output_pil, PIL.Image.Image):
            return output_pil

        output_tensor = F.to_image_tensor(output_pil)

        # 2D mask shenanigans
        if output_tensor.ndim == 2 and input_tensor.ndim == 3:
            output_tensor = output_tensor.unsqueeze(0)
        elif output_tensor.ndim == 3 and input_tensor.ndim == 2:
            output_tensor = output_tensor.squeeze(0)

        return output_tensor

    return wrapper
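
# Wrapped PIL kernels plug directly into `reference_fn` below, e.g.
# `pil_reference_wrapper(F.affine_image_pil)`: the uint8 tensor sample is converted
# to a PIL image, transformed by the PIL kernel, and converted back, so
# `assert_close` can compare it against the tensor kernel's output.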


def xfail_jit(reason, *, condition=None):
    return TestMark(("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), condition=condition)


def xfail_jit_python_scalar_arg(name, *, reason=None):
    return xfail_jit(
        reason or f"Python scalar int or float for `{name}` is not supported when scripting",
        condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)),
    )


KERNEL_INFOS = []


_AFFINE_KWARGS = combinations_grid(
    angle=[-87, 15, 90],
    translate=[(5, 5), (-5, -5)],
    scale=[0.77, 1.27],
    shear=[(12, 12), (0, 0)],
)


def _diversify_affine_kwargs_types(affine_kwargs):
    angle = affine_kwargs["angle"]
    for diverse_angle in [int(angle), float(angle)]:
        yield dict(affine_kwargs, angle=diverse_angle)

    shear = affine_kwargs["shear"]
    for diverse_shear in [tuple(shear), list(shear), int(shear[0]), float(shear[0])]:
        yield dict(affine_kwargs, shear=diverse_shear)


def _full_affine_params(**partial_params):
    partial_params.setdefault("angle", 0.0)
    partial_params.setdefault("translate", [0.0, 0.0])
    partial_params.setdefault("scale", 1.0)
    partial_params.setdefault("shear", [0.0, 0.0])
    partial_params.setdefault("center", None)
    return partial_params
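
# e.g. _full_affine_params(angle=30) returns
# {"angle": 30, "translate": [0.0, 0.0], "scale": 1.0, "shear": [0.0, 0.0], "center": None}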


_DIVERSE_AFFINE_PARAMS = [
    _full_affine_params(**{name: arg})
    for name, args in [
        ("angle", [1.0, 2]),
        ("translate", [[1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]),
        ("scale", [0.5]),
        ("shear", [1.0, 2, [1.0], [2], (1.0,), (2,), [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]),
        ("center", [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]),
    ]
    for arg in args
]


def get_fills(*, num_channels, dtype):
    yield None

    int_value = get_max_value(dtype)
    float_value = int_value / 2
    yield int_value
    yield float_value

    for vector_type in [list, tuple]:
        yield vector_type([int_value])
        yield vector_type([float_value])

        if num_channels > 1:
            yield vector_type(float_value * c / 10 for c in range(num_channels))
            yield vector_type(int_value if c % 2 == 0 else 0 for c in range(num_channels))
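

# Example output (hedged; ordering follows the yields above) for num_channels=3 and
# dtype=torch.uint8: the scalars None, 255, and 127.5, the single-element vectors
# [255], [127.5], (255,), (127.5,), and the per-channel vectors [0.0, 12.75, 25.5]
# and [255, 0, 255] (plus their tuple counterparts).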


def float32_vs_uint8_fill_adapter(other_args, kwargs):
    fill = kwargs.get("fill")
    if fill is None:
        return other_args, kwargs

    if isinstance(fill, (int, float)):
        fill /= 255
    else:
        fill = type(fill)(fill_ / 255 for fill_ in fill)

    return other_args, dict(kwargs, fill=fill)
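
# Example: a uint8 fill of 255 becomes 1.0 for the float32 run of the same sample,
# matching how image values are rescaled between the two dtypes:
#
#   >>> float32_vs_uint8_fill_adapter((), dict(fill=255))
#   ((), {'fill': 1.0})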


def sample_inputs_affine_image_tensor():
    make_affine_image_loaders = functools.partial(
        make_image_loaders, sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32]
    )

    for image_loader, affine_params in itertools.product(make_affine_image_loaders(), _DIVERSE_AFFINE_PARAMS):
        yield ArgsKwargs(image_loader, **affine_params)

    for image_loader in make_affine_image_loaders():
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(image_loader, **_full_affine_params(), fill=fill)

    for image_loader, interpolation in itertools.product(
        make_affine_image_loaders(),
        [
            F.InterpolationMode.NEAREST,
            F.InterpolationMode.BILINEAR,
        ],
    ):
        yield ArgsKwargs(image_loader, **_full_affine_params(), interpolation=interpolation, fill=0)


def reference_inputs_affine_image_tensor():
    for image_loader, affine_kwargs in itertools.product(make_image_loaders_for_interpolation(), _AFFINE_KWARGS):
        yield ArgsKwargs(
            image_loader,
            interpolation=F.InterpolationMode.NEAREST,
            **affine_kwargs,
        )


def sample_inputs_affine_bounding_box():
    for bounding_box_loader, affine_params in itertools.product(
        make_bounding_box_loaders(formats=[datapoints.BoundingBoxFormat.XYXY]), _DIVERSE_AFFINE_PARAMS
    ):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            **affine_params,
        )


def _compute_affine_matrix(angle, translate, scale, shear, center):
    rot = math.radians(angle)
    cx, cy = center
    tx, ty = translate
    sx, sy = [math.radians(sh_) for sh_ in shear]

    c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]])
    t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
    c_matrix_inv = np.linalg.inv(c_matrix)
    rs_matrix = np.array(
        [
            [scale * math.cos(rot), -scale * math.sin(rot), 0],
            [scale * math.sin(rot), scale * math.cos(rot), 0],
            [0, 0, 1],
        ]
    )
    shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]])
    shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]])
    rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix))
    true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv)))
    return true_matrix
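
# The composition mirrors the torchvision convention M = T @ C @ RSS @ C^-1:
# rotation, scale, and shear (RSS) are applied around `center` (conjugation by C)
# and the translation T is added on top. E.g. for angle=0, scale=1, and no shear,
# the linear part is the identity and the matrix reduces to a pure translation.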


def reference_affine_bounding_box_helper(bounding_box, *, format, spatial_size, affine_matrix):
    def transform(bbox, affine_matrix_, format_, spatial_size_):
        # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
        in_dtype = bbox.dtype
        if not torch.is_floating_point(bbox):
            bbox = bbox.float()
        bbox_xyxy = F.convert_format_bounding_box(
            bbox.as_subclass(torch.Tensor),
            old_format=format_,
            new_format=datapoints.BoundingBoxFormat.XYXY,
            inplace=True,
        )
        points = np.array(
            [
                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
            ]
        )
        transformed_points = np.matmul(points, affine_matrix_.T)
        out_bbox = torch.tensor(
            [
                np.min(transformed_points[:, 0]).item(),
                np.min(transformed_points[:, 1]).item(),
                np.max(transformed_points[:, 0]).item(),
                np.max(transformed_points[:, 1]).item(),
            ],
            dtype=bbox_xyxy.dtype,
        )
        out_bbox = F.convert_format_bounding_box(
            out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True
        )
        # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
        out_bbox = F.clamp_bounding_box(out_bbox, format=format_, spatial_size=spatial_size_)
        out_bbox = out_bbox.to(dtype=in_dtype)
        return out_bbox

    if bounding_box.ndim < 2:
        bounding_box = [bounding_box]

    expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bounding_box]
    if len(expected_bboxes) > 1:
        expected_bboxes = torch.stack(expected_bboxes)
    else:
        expected_bboxes = expected_bboxes[0]

    return expected_bboxes
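
# The helper deliberately returns the axis-aligned hull of the four transformed
# corners. E.g. under the rotation (x, y) -> (-y, x), the XYXY box (0, 0, 2, 1) has
# corner images with x in [-1, 0] and y in [0, 2], so the reference box is
# (-1, 0, 0, 2) before clamping.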


def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None):
    if center is None:
        center = [s * 0.5 for s in spatial_size[::-1]]

    affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center)
    affine_matrix = affine_matrix[:2, :]

    expected_bboxes = reference_affine_bounding_box_helper(
        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
    )

    return expected_bboxes


def reference_inputs_affine_bounding_box():
    for bounding_box_loader, affine_kwargs in itertools.product(
        make_bounding_box_loaders(extra_dims=[()]),
        _AFFINE_KWARGS,
    ):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            **affine_kwargs,
        )


def sample_inputs_affine_mask():
    for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
        yield ArgsKwargs(mask_loader, **_full_affine_params())


def sample_inputs_affine_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, **_full_affine_params())


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.affine_image_tensor,
            sample_inputs_fn=sample_inputs_affine_image_tensor,
            reference_fn=pil_reference_wrapper(F.affine_image_pil),
            reference_inputs_fn=reference_inputs_affine_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs=pil_reference_pixel_difference(10, mae=True),
            test_marks=[
                xfail_jit_python_scalar_arg("shear"),
                xfail_jit_python_scalar_arg("fill"),
            ],
        ),
        KernelInfo(
            F.affine_bounding_box,
            sample_inputs_fn=sample_inputs_affine_bounding_box,
            reference_fn=reference_affine_bounding_box,
            reference_inputs_fn=reference_inputs_affine_bounding_box,
            test_marks=[
                xfail_jit_python_scalar_arg("shear"),
            ],
        ),
        KernelInfo(
            F.affine_mask,
            sample_inputs_fn=sample_inputs_affine_mask,
            test_marks=[
                xfail_jit_python_scalar_arg("shear"),
            ],
        ),
        KernelInfo(
            F.affine_video,
            sample_inputs_fn=sample_inputs_affine_video,
        ),
    ]
)


def sample_inputs_convert_format_bounding_box():
    formats = list(datapoints.BoundingBoxFormat)
    for bounding_box_loader, new_format in itertools.product(make_bounding_box_loaders(formats=formats), formats):
        yield ArgsKwargs(bounding_box_loader, old_format=bounding_box_loader.format, new_format=new_format)


def reference_convert_format_bounding_box(bounding_box, old_format, new_format):
    return torchvision.ops.box_convert(
        bounding_box, in_fmt=old_format.name.lower(), out_fmt=new_format.name.lower()
    ).to(bounding_box.dtype)


def reference_inputs_convert_format_bounding_box():
    for args_kwargs in sample_inputs_convert_format_bounding_box():
        if len(args_kwargs.args[0].shape) == 2:
            yield args_kwargs


KERNEL_INFOS.append(
    KernelInfo(
        F.convert_format_bounding_box,
        sample_inputs_fn=sample_inputs_convert_format_bounding_box,
        reference_fn=reference_convert_format_bounding_box,
        reference_inputs_fn=reference_inputs_convert_format_bounding_box,
        logs_usage=True,
    ),
)


def sample_inputs_vertical_flip_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], dtypes=[torch.float32]):
        yield ArgsKwargs(image_loader)


def reference_inputs_vertical_flip_image_tensor():
    for image_loader in make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]):
        yield ArgsKwargs(image_loader)


def sample_inputs_vertical_flip_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders(
        formats=[datapoints.BoundingBoxFormat.XYXY], dtypes=[torch.float32]
    ):
        yield ArgsKwargs(
            bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
        )


def sample_inputs_vertical_flip_mask():
    for mask_loader in make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]):
        yield ArgsKwargs(mask_loader)


def sample_inputs_vertical_flip_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


def reference_vertical_flip_bounding_box(bounding_box, *, format, spatial_size):
    affine_matrix = np.array(
        [
            [1, 0, 0],
            [0, -1, spatial_size[0]],
        ],
        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
    )

    expected_bboxes = reference_affine_bounding_box_helper(
        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
    )

    return expected_bboxes
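
# The matrix above encodes y -> spatial_size[0] - y, i.e. a flip about the
# horizontal mid-line, which reduces vertical flip to the generic affine bounding
# box reference.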


def reference_inputs_vertical_flip_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders(extra_dims=[()]):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
        )


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.vertical_flip_image_tensor,
            kernel_name="vertical_flip_image_tensor",
            sample_inputs_fn=sample_inputs_vertical_flip_image_tensor,
            reference_fn=pil_reference_wrapper(F.vertical_flip_image_pil),
            reference_inputs_fn=reference_inputs_vertical_flip_image_tensor,
            float32_vs_uint8=True,
        ),
        KernelInfo(
            F.vertical_flip_bounding_box,
            sample_inputs_fn=sample_inputs_vertical_flip_bounding_box,
            reference_fn=reference_vertical_flip_bounding_box,
            reference_inputs_fn=reference_inputs_vertical_flip_bounding_box,
        ),
        KernelInfo(
            F.vertical_flip_mask,
            sample_inputs_fn=sample_inputs_vertical_flip_mask,
        ),
        KernelInfo(
            F.vertical_flip_video,
            sample_inputs_fn=sample_inputs_vertical_flip_video,
        ),
    ]
)

_ROTATE_ANGLES = [-87, 15, 90]


def sample_inputs_rotate_image_tensor():
    make_rotate_image_loaders = functools.partial(
        make_image_loaders, sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32]
    )

    for image_loader in make_rotate_image_loaders():
        yield ArgsKwargs(image_loader, angle=15.0, expand=True)

    for image_loader, center in itertools.product(
        make_rotate_image_loaders(), [None, [1.0, 0.5], [1, 2], (1.0, 0.5), (1, 2)]
    ):
        yield ArgsKwargs(image_loader, angle=15.0, center=center)

    for image_loader in make_rotate_image_loaders():
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(image_loader, angle=15.0, fill=fill)

    for image_loader, interpolation in itertools.product(
        make_rotate_image_loaders(),
        [F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR],
    ):
        yield ArgsKwargs(image_loader, angle=15.0, interpolation=interpolation, fill=0)


def reference_inputs_rotate_image_tensor():
    for image_loader, angle in itertools.product(make_image_loaders_for_interpolation(), _ROTATE_ANGLES):
        yield ArgsKwargs(image_loader, angle=angle)


def sample_inputs_rotate_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders():
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            angle=_ROTATE_ANGLES[0],
        )


def reference_inputs_rotate_bounding_box():
    for bounding_box_loader, angle in itertools.product(
        make_bounding_box_loaders(extra_dims=((), (4,))), _ROTATE_ANGLES
    ):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            angle=angle,
        )

    # TODO: add samples with expand=True and center


def reference_rotate_bounding_box(bounding_box, *, format, spatial_size, angle, expand=False, center=None):
    if center is None:
        center = [spatial_size[1] * 0.5, spatial_size[0] * 0.5]

    a = np.cos(angle * np.pi / 180.0)
    b = np.sin(angle * np.pi / 180.0)
    cx = center[0]
    cy = center[1]
    affine_matrix = np.array(
        [
            [a, b, cx - cx * a - b * cy],
            [-b, a, cy + cx * b - a * cy],
        ],
        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
    )

    expected_bboxes = reference_affine_bounding_box_helper(
        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
    )
    return expected_bboxes, spatial_size
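
# Note the sign convention: with image coordinates (y pointing down), a positive
# `angle` rotates counter-clockwise, which is why the linear part above is
# [[cos, sin], [-sin, cos]] around `center` rather than the usual mathematical
# rotation matrix.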


def sample_inputs_rotate_mask():
    for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
        yield ArgsKwargs(mask_loader, angle=15.0)


def sample_inputs_rotate_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, angle=15.0)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.rotate_image_tensor,
            sample_inputs_fn=sample_inputs_rotate_image_tensor,
            reference_fn=pil_reference_wrapper(F.rotate_image_pil),
            reference_inputs_fn=reference_inputs_rotate_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs=pil_reference_pixel_difference(1, mae=True),
            test_marks=[
                xfail_jit_python_scalar_arg("fill"),
            ],
        ),
        KernelInfo(
            F.rotate_bounding_box,
            sample_inputs_fn=sample_inputs_rotate_bounding_box,
            reference_fn=reference_rotate_bounding_box,
            reference_inputs_fn=reference_inputs_rotate_bounding_box,
            closeness_kwargs={
                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-4, rtol=1e-4),
                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-4, rtol=1e-4),
            },
        ),
        KernelInfo(
            F.rotate_mask,
            sample_inputs_fn=sample_inputs_rotate_mask,
        ),
        KernelInfo(
            F.rotate_video,
            sample_inputs_fn=sample_inputs_rotate_video,
        ),
    ]
)

_CROP_PARAMS = combinations_grid(top=[-8, 0, 9], left=[-8, 0, 9], height=[12, 20], width=[12, 20])


def sample_inputs_crop_image_tensor():
    for image_loader, params in itertools.product(
        make_image_loaders(sizes=[(16, 17)], color_spaces=["RGB"], dtypes=[torch.float32]),
        [
            dict(top=4, left=3, height=7, width=8),
            dict(top=-1, left=3, height=7, width=8),
            dict(top=4, left=-1, height=7, width=8),
            dict(top=4, left=3, height=17, width=8),
            dict(top=4, left=3, height=7, width=18),
        ],
    ):
        yield ArgsKwargs(image_loader, **params)


def reference_inputs_crop_image_tensor():
    for image_loader, params in itertools.product(
        make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]), _CROP_PARAMS
    ):
        yield ArgsKwargs(image_loader, **params)


def sample_inputs_crop_bounding_box():
    for bounding_box_loader, params in itertools.product(
        make_bounding_box_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]
    ):
        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params)


def sample_inputs_crop_mask():
    for mask_loader in make_mask_loaders(sizes=[(16, 17)], num_categories=["random"], num_objects=["random"]):
        yield ArgsKwargs(mask_loader, top=4, left=3, height=7, width=8)


def reference_inputs_crop_mask():
    for mask_loader, params in itertools.product(make_mask_loaders(extra_dims=[()], num_objects=[1]), _CROP_PARAMS):
        yield ArgsKwargs(mask_loader, **params)


def sample_inputs_crop_video():
    for video_loader in make_video_loaders(sizes=[(16, 17)], num_frames=["random"]):
        yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8)


def reference_crop_bounding_box(bounding_box, *, format, top, left, height, width):
    affine_matrix = np.array(
        [
            [1, 0, -left],
            [0, 1, -top],
        ],
        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
    )

    spatial_size = (height, width)
    expected_bboxes = reference_affine_bounding_box_helper(
        bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
    )
    return expected_bboxes, spatial_size


def reference_inputs_crop_bounding_box():
    for bounding_box_loader, params in itertools.product(
        make_bounding_box_loaders(extra_dims=((), (4,))), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]
    ):
        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.crop_image_tensor,
            kernel_name="crop_image_tensor",
            sample_inputs_fn=sample_inputs_crop_image_tensor,
            reference_fn=pil_reference_wrapper(F.crop_image_pil),
            reference_inputs_fn=reference_inputs_crop_image_tensor,
            float32_vs_uint8=True,
        ),
        KernelInfo(
            F.crop_bounding_box,
            sample_inputs_fn=sample_inputs_crop_bounding_box,
            reference_fn=reference_crop_bounding_box,
            reference_inputs_fn=reference_inputs_crop_bounding_box,
        ),
        KernelInfo(
            F.crop_mask,
            sample_inputs_fn=sample_inputs_crop_mask,
            reference_fn=pil_reference_wrapper(F.crop_image_pil),
            reference_inputs_fn=reference_inputs_crop_mask,
            float32_vs_uint8=True,
        ),
        KernelInfo(
            F.crop_video,
            sample_inputs_fn=sample_inputs_crop_video,
        ),
    ]
)

_RESIZED_CROP_PARAMS = combinations_grid(top=[-8, 9], left=[-8, 9], height=[12], width=[12], size=[(16, 18)])


def sample_inputs_resized_crop_image_tensor():
    for image_loader in make_image_loaders():
        yield ArgsKwargs(image_loader, **_RESIZED_CROP_PARAMS[0])


@pil_reference_wrapper
def reference_resized_crop_image_tensor(*args, **kwargs):
    if not kwargs.pop("antialias", False) and kwargs.get("interpolation", F.InterpolationMode.BILINEAR) in {
        F.InterpolationMode.BILINEAR,
        F.InterpolationMode.BICUBIC,
    }:
        raise pytest.UsageError("Anti-aliasing is always active in PIL")
    return F.resized_crop_image_pil(*args, **kwargs)


def reference_inputs_resized_crop_image_tensor():
    for image_loader, interpolation, params in itertools.product(
        make_image_loaders_for_interpolation(),
        [
            F.InterpolationMode.NEAREST,
            F.InterpolationMode.NEAREST_EXACT,
            F.InterpolationMode.BILINEAR,
            F.InterpolationMode.BICUBIC,
        ],
        _RESIZED_CROP_PARAMS,
    ):
        yield ArgsKwargs(
            image_loader,
            interpolation=interpolation,
            antialias=interpolation
            in {
                F.InterpolationMode.BILINEAR,
                F.InterpolationMode.BICUBIC,
            },
            **params,
        )


def sample_inputs_resized_crop_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders():
        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **_RESIZED_CROP_PARAMS[0])


def sample_inputs_resized_crop_mask():
    for mask_loader in make_mask_loaders():
        yield ArgsKwargs(mask_loader, **_RESIZED_CROP_PARAMS[0])


def sample_inputs_resized_crop_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.resized_crop_image_tensor,
            sample_inputs_fn=sample_inputs_resized_crop_image_tensor,
            reference_fn=reference_resized_crop_image_tensor,
            reference_inputs_fn=reference_inputs_resized_crop_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **cuda_vs_cpu_pixel_difference(),
                **pil_reference_pixel_difference(3, mae=True),
                **float32_vs_uint8_pixel_difference(3, mae=True),
            },
        ),
        KernelInfo(
            F.resized_crop_bounding_box,
            sample_inputs_fn=sample_inputs_resized_crop_bounding_box,
        ),
        KernelInfo(
            F.resized_crop_mask,
            sample_inputs_fn=sample_inputs_resized_crop_mask,
        ),
        KernelInfo(
            F.resized_crop_video,
            sample_inputs_fn=sample_inputs_resized_crop_video,
            closeness_kwargs=cuda_vs_cpu_pixel_difference(),
        ),
    ]
)

_PAD_PARAMS = combinations_grid(
    padding=[[1], [1, 1], [1, 1, 2, 2]],
    padding_mode=["constant", "symmetric", "edge", "reflect"],
)


def sample_inputs_pad_image_tensor():
    make_pad_image_loaders = functools.partial(
        make_image_loaders, sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32]
    )

    for image_loader, padding in itertools.product(
        make_pad_image_loaders(),
        [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]],
    ):
        yield ArgsKwargs(image_loader, padding=padding)

    for image_loader in make_pad_image_loaders():
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(image_loader, padding=[1], fill=fill)

    for image_loader, padding_mode in itertools.product(
        # We branch for non-constant padding and integer inputs
        make_pad_image_loaders(dtypes=[torch.uint8]),
        ["constant", "symmetric", "edge", "reflect"],
    ):
        yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode)

    # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides
    # negative padding, this is already handled by the inputs above.
    for image_loader in make_pad_image_loaders():
        yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric")


def reference_inputs_pad_image_tensor():
    for image_loader, params in itertools.product(
        make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]), _PAD_PARAMS
    ):
        for fill in get_fills(
            num_channels=image_loader.num_channels,
            dtype=image_loader.dtype,
        ):
            # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it?
            if isinstance(fill, (list, tuple)):
                continue

            yield ArgsKwargs(image_loader, fill=fill, **params)


def sample_inputs_pad_bounding_box():
    for bounding_box_loader, padding in itertools.product(
        make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
    ):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            padding=padding,
            padding_mode="constant",
        )


def sample_inputs_pad_mask():
    for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
        yield ArgsKwargs(mask_loader, padding=[1])


def reference_inputs_pad_mask():
    for mask_loader, fill, params in itertools.product(
        make_mask_loaders(num_objects=[1], extra_dims=[()]), [None, 127], _PAD_PARAMS
    ):
        yield ArgsKwargs(mask_loader, fill=fill, **params)


def sample_inputs_pad_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, padding=[1])


def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, padding_mode):
    left, right, top, bottom = _parse_pad_padding(padding)

    affine_matrix = np.array(
        [
            [1, 0, left],
            [0, 1, top],
        ],
        dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
    )

    height = spatial_size[0] + top + bottom
    width = spatial_size[1] + left + right

    expected_bboxes = reference_affine_bounding_box_helper(
        bounding_box, format=format, spatial_size=(height, width), affine_matrix=affine_matrix
    )
    return expected_bboxes, (height, width)
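
# Padding only translates boxes by the (left, top) offset; the returned spatial
# size grows by the total horizontal and vertical padding, matching the canvas
# size the pad kernels report.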


def reference_inputs_pad_bounding_box():
    for bounding_box_loader, padding in itertools.product(
        make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
    ):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            padding=padding,
            padding_mode="constant",
        )


def pad_xfail_jit_fill_condition(args_kwargs):
    fill = args_kwargs.kwargs.get("fill")
    if not isinstance(fill, (list, tuple)):
        return False
    elif isinstance(fill, tuple):
        return True
    else:  # isinstance(fill, list):
        return all(isinstance(f, int) for f in fill)
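
# In other words: scripted `F.pad` is expected to handle a fill like [0.5, 0.5]
# (a list of floats), while tuples and lists of ints are expected to xfail, which
# is exactly what the predicate above encodes.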


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.pad_image_tensor,
            sample_inputs_fn=sample_inputs_pad_image_tensor,
            reference_fn=pil_reference_wrapper(F.pad_image_pil),
            reference_inputs_fn=reference_inputs_pad_image_tensor,
            float32_vs_uint8=float32_vs_uint8_fill_adapter,
            closeness_kwargs=float32_vs_uint8_pixel_difference(),
            test_marks=[
                xfail_jit_python_scalar_arg("padding"),
                xfail_jit(
                    "F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition
                ),
            ],
        ),
        KernelInfo(
            F.pad_bounding_box,
            sample_inputs_fn=sample_inputs_pad_bounding_box,
            reference_fn=reference_pad_bounding_box,
            reference_inputs_fn=reference_inputs_pad_bounding_box,
            test_marks=[
                xfail_jit_python_scalar_arg("padding"),
            ],
        ),
        KernelInfo(
            F.pad_mask,
            sample_inputs_fn=sample_inputs_pad_mask,
            reference_fn=pil_reference_wrapper(F.pad_image_pil),
            reference_inputs_fn=reference_inputs_pad_mask,
            float32_vs_uint8=float32_vs_uint8_fill_adapter,
        ),
        KernelInfo(
            F.pad_video,
            sample_inputs_fn=sample_inputs_pad_video,
        ),
    ]
)

_PERSPECTIVE_COEFFS = [
    [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
    [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
]
_STARTPOINTS = [[0, 1], [2, 3], [4, 5], [6, 7]]
_ENDPOINTS = [[9, 8], [7, 6], [5, 4], [3, 2]]


def sample_inputs_perspective_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"]):
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(
                image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0]
            )

    yield ArgsKwargs(make_image_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)


def reference_inputs_perspective_image_tensor():
    for image_loader, coefficients, interpolation in itertools.product(
        make_image_loaders_for_interpolation(),
        _PERSPECTIVE_COEFFS,
        [
            F.InterpolationMode.NEAREST,
            F.InterpolationMode.BILINEAR,
        ],
    ):
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it?
            if isinstance(fill, (list, tuple)):
                continue

            yield ArgsKwargs(
                image_loader,
                startpoints=None,
                endpoints=None,
                interpolation=interpolation,
                fill=fill,
                coefficients=coefficients,
            )


def sample_inputs_perspective_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders():
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            startpoints=None,
            endpoints=None,
            coefficients=_PERSPECTIVE_COEFFS[0],
        )

    format = datapoints.BoundingBoxFormat.XYXY
    loader = make_bounding_box_loader(format=format)
    yield ArgsKwargs(
        loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
    )


def sample_inputs_perspective_mask():
    for mask_loader in make_mask_loaders(sizes=["random"]):
        yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0])

    yield ArgsKwargs(make_detection_mask_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)


def reference_inputs_perspective_mask():
    for mask_loader, perspective_coeffs in itertools.product(
        make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS
    ):
        yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=perspective_coeffs)


def sample_inputs_perspective_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0])

    yield ArgsKwargs(make_video_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.perspective_image_tensor,
            sample_inputs_fn=sample_inputs_perspective_image_tensor,
            reference_fn=pil_reference_wrapper(F.perspective_image_pil),
            reference_inputs_fn=reference_inputs_perspective_image_tensor,
            float32_vs_uint8=float32_vs_uint8_fill_adapter,
            closeness_kwargs={
                **pil_reference_pixel_difference(2, mae=True),
                **cuda_vs_cpu_pixel_difference(),
                **float32_vs_uint8_pixel_difference(),
                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
            },
            test_marks=[xfail_jit_python_scalar_arg("fill")],
        ),
        KernelInfo(
            F.perspective_bounding_box,
            sample_inputs_fn=sample_inputs_perspective_bounding_box,
            closeness_kwargs={
                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6),
            },
        ),
        KernelInfo(
            F.perspective_mask,
            sample_inputs_fn=sample_inputs_perspective_mask,
            reference_fn=pil_reference_wrapper(F.perspective_image_pil),
            reference_inputs_fn=reference_inputs_perspective_mask,
            float32_vs_uint8=True,
            closeness_kwargs={
                (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): dict(atol=10, rtol=0),
            },
        ),
        KernelInfo(
            F.perspective_video,
            sample_inputs_fn=sample_inputs_perspective_video,
            closeness_kwargs={
                **cuda_vs_cpu_pixel_difference(),
                **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
                **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
            },
        ),
    ]
)


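# The displacement grid has shape (1, H, W, 2), i.e. one (dx, dy) offset per
# output pixel, which is the layout the elastic kernels consume.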
def _get_elastic_displacement(spatial_size):
    return torch.rand(1, *spatial_size, 2)


def sample_inputs_elastic_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"]):
        displacement = _get_elastic_displacement(image_loader.spatial_size)
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(image_loader, displacement=displacement, fill=fill)


def reference_inputs_elastic_image_tensor():
    for image_loader, interpolation in itertools.product(
        make_image_loaders_for_interpolation(),
        [
            F.InterpolationMode.NEAREST,
            F.InterpolationMode.BILINEAR,
            F.InterpolationMode.BICUBIC,
        ],
    ):
        displacement = _get_elastic_displacement(image_loader.spatial_size)
        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
            yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill)


def sample_inputs_elastic_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders():
        displacement = _get_elastic_displacement(bounding_box_loader.spatial_size)
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            displacement=displacement,
        )


def sample_inputs_elastic_mask():
    for mask_loader in make_mask_loaders(sizes=["random"]):
        displacement = _get_elastic_displacement(mask_loader.shape[-2:])
        yield ArgsKwargs(mask_loader, displacement=displacement)


def sample_inputs_elastic_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        displacement = _get_elastic_displacement(video_loader.shape[-2:])
        yield ArgsKwargs(video_loader, displacement=displacement)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.elastic_image_tensor,
            sample_inputs_fn=sample_inputs_elastic_image_tensor,
            reference_inputs_fn=reference_inputs_elastic_image_tensor,
            float32_vs_uint8=float32_vs_uint8_fill_adapter,
            closeness_kwargs={
                **float32_vs_uint8_pixel_difference(6, mae=True),
                **cuda_vs_cpu_pixel_difference(),
            },
            test_marks=[xfail_jit_python_scalar_arg("fill")],
        ),
        KernelInfo(
            F.elastic_bounding_box,
            sample_inputs_fn=sample_inputs_elastic_bounding_box,
        ),
        KernelInfo(
            F.elastic_mask,
            sample_inputs_fn=sample_inputs_elastic_mask,
        ),
        KernelInfo(
            F.elastic_video,
            sample_inputs_fn=sample_inputs_elastic_video,
            closeness_kwargs=cuda_vs_cpu_pixel_difference(),
        ),
    ]
)


_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)]
_CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)]


def sample_inputs_center_crop_image_tensor():
    for image_loader, output_size in itertools.product(
        make_image_loaders(sizes=[(16, 17)], color_spaces=["RGB"], dtypes=[torch.float32]),
        [
            # valid `output_size` types for which cropping is applied to both dimensions
            *[5, (4,), (2, 3), [6], [3, 2]],
            # `output_size`'s for which at least one dimension needs to be padded
            *[[4, 18], [17, 5], [17, 18]],
        ],
    ):
        yield ArgsKwargs(image_loader, output_size=output_size)


def reference_inputs_center_crop_image_tensor():
    for image_loader, output_size in itertools.product(
        make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], dtypes=[torch.uint8]),
        _CENTER_CROP_OUTPUT_SIZES,
    ):
        yield ArgsKwargs(image_loader, output_size=output_size)


def sample_inputs_center_crop_bounding_box():
    for bounding_box_loader, output_size in itertools.product(make_bounding_box_loaders(), _CENTER_CROP_OUTPUT_SIZES):
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
            output_size=output_size,
        )


def sample_inputs_center_crop_mask():
    for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
        height, width = mask_loader.shape[-2:]
        yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2))


def reference_inputs_center_crop_mask():
    for mask_loader, output_size in itertools.product(
        make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES
    ):
        yield ArgsKwargs(mask_loader, output_size=output_size)


def sample_inputs_center_crop_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        height, width = video_loader.shape[-2:]
        yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2))


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.center_crop_image_tensor,
            sample_inputs_fn=sample_inputs_center_crop_image_tensor,
            reference_fn=pil_reference_wrapper(F.center_crop_image_pil),
            reference_inputs_fn=reference_inputs_center_crop_image_tensor,
            float32_vs_uint8=True,
            test_marks=[
                xfail_jit_python_scalar_arg("output_size"),
            ],
        ),
        KernelInfo(
            F.center_crop_bounding_box,
            sample_inputs_fn=sample_inputs_center_crop_bounding_box,
            test_marks=[
                xfail_jit_python_scalar_arg("output_size"),
            ],
        ),
        KernelInfo(
            F.center_crop_mask,
            sample_inputs_fn=sample_inputs_center_crop_mask,
            reference_fn=pil_reference_wrapper(F.center_crop_image_pil),
            reference_inputs_fn=reference_inputs_center_crop_mask,
            float32_vs_uint8=True,
            test_marks=[
                xfail_jit_python_scalar_arg("output_size"),
            ],
        ),
        KernelInfo(
            F.center_crop_video,
            sample_inputs_fn=sample_inputs_center_crop_video,
        ),
    ]
)


def sample_inputs_gaussian_blur_image_tensor():
    make_gaussian_blur_image_loaders = functools.partial(make_image_loaders, sizes=[(7, 33)], color_spaces=["RGB"])

    for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]):
        yield ArgsKwargs(image_loader, kernel_size=kernel_size)

    for image_loader, sigma in itertools.product(
        make_gaussian_blur_image_loaders(), [None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)]
    ):
        yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma)


def sample_inputs_gaussian_blur_video():
    for video_loader in make_video_loaders(sizes=[(7, 33)], num_frames=[5]):
        yield ArgsKwargs(video_loader, kernel_size=[3, 3])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.gaussian_blur_image_tensor,
            sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor,
            closeness_kwargs=cuda_vs_cpu_pixel_difference(),
            test_marks=[
                xfail_jit_python_scalar_arg("kernel_size"),
                xfail_jit_python_scalar_arg("sigma"),
            ],
        ),
        KernelInfo(
            F.gaussian_blur_video,
            sample_inputs_fn=sample_inputs_gaussian_blur_video,
            closeness_kwargs=cuda_vs_cpu_pixel_difference(),
        ),
    ]
)


def sample_inputs_equalize_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader)


def reference_inputs_equalize_image_tensor():
    # We are not using `make_image_loaders` here since that uniformly samples the values over the whole value range.
    # Since the whole point of this kernel is to transform an arbitrary distribution of values into a uniform one,
    # the information gain is low if we already provide something really close to the expected value.
    def make_uniform_band_image(shape, dtype, device, *, low_factor, high_factor, memory_format):
        if dtype.is_floating_point:
            low = low_factor
            high = high_factor
        else:
            max_value = torch.iinfo(dtype).max
            low = int(low_factor * max_value)
            high = int(high_factor * max_value)
        return torch.testing.make_tensor(shape, dtype=dtype, device=device, low=low, high=high).to(
            memory_format=memory_format, copy=True
        )

    def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_format):
        image = torch.distributions.Beta(alpha, beta).sample(shape)
        if not dtype.is_floating_point:
            image.mul_(torch.iinfo(dtype).max).round_()
        return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True)

    spatial_size = (256, 256)
    for dtype, color_space, fn in itertools.product(
        [torch.uint8],
        ["GRAY", "RGB"],
        [
            lambda shape, dtype, device, memory_format: torch.zeros(shape, dtype=dtype, device=device).to(
                memory_format=memory_format, copy=True
            ),
            lambda shape, dtype, device, memory_format: torch.full(
                shape, 1.0 if dtype.is_floating_point else torch.iinfo(dtype).max, dtype=dtype, device=device
            ).to(memory_format=memory_format, copy=True),
            *[
                functools.partial(make_uniform_band_image, low_factor=low_factor, high_factor=high_factor)
                for low_factor, high_factor in [
                    (0.0, 0.25),
                    (0.25, 0.75),
                    (0.75, 1.0),
                ]
            ],
            *[
                functools.partial(make_beta_distributed_image, alpha=alpha, beta=beta)
                for alpha, beta in [
                    (0.5, 0.5),
                    (2, 2),
                    (2, 5),
                    (5, 2),
                ]
            ],
        ],
    ):
        image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *spatial_size), dtype=dtype)
        yield ArgsKwargs(image_loader)


def sample_inputs_equalize_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.equalize_image_tensor,
            kernel_name="equalize_image_tensor",
            sample_inputs_fn=sample_inputs_equalize_image_tensor,
            reference_fn=pil_reference_wrapper(F.equalize_image_pil),
            float32_vs_uint8=True,
            reference_inputs_fn=reference_inputs_equalize_image_tensor,
        ),
        KernelInfo(
            F.equalize_video,
            sample_inputs_fn=sample_inputs_equalize_video,
        ),
    ]
)


def sample_inputs_invert_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader)


def reference_inputs_invert_image_tensor():
    for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]):
        yield ArgsKwargs(image_loader)


def sample_inputs_invert_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.invert_image_tensor,
            kernel_name="invert_image_tensor",
            sample_inputs_fn=sample_inputs_invert_image_tensor,
            reference_fn=pil_reference_wrapper(F.invert_image_pil),
            reference_inputs_fn=reference_inputs_invert_image_tensor,
            float32_vs_uint8=True,
        ),
        KernelInfo(
            F.invert_video,
            sample_inputs_fn=sample_inputs_invert_video,
        ),
    ]
)


_POSTERIZE_BITS = [1, 4, 8]


def sample_inputs_posterize_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0])


def reference_inputs_posterize_image_tensor():
    for image_loader, bits in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _POSTERIZE_BITS,
    ):
        yield ArgsKwargs(image_loader, bits=bits)


def sample_inputs_posterize_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.posterize_image_tensor,
            kernel_name="posterize_image_tensor",
            sample_inputs_fn=sample_inputs_posterize_image_tensor,
            reference_fn=pil_reference_wrapper(F.posterize_image_pil),
            reference_inputs_fn=reference_inputs_posterize_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs=float32_vs_uint8_pixel_difference(),
        ),
        KernelInfo(
            F.posterize_video,
            sample_inputs_fn=sample_inputs_posterize_video,
        ),
    ]
)


def _get_solarize_thresholds(dtype):
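    # Yield thresholds at 10% and 50% of the dtype's maximum value, e.g. 25 and 127 for torch.uint8.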
    for factor in [0.1, 0.5]:
        max_value = get_max_value(dtype)
        yield (float if dtype.is_floating_point else int)(max_value * factor)


def sample_inputs_solarize_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype)))


def reference_inputs_solarize_image_tensor():
    for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]):
        for threshold in _get_solarize_thresholds(image_loader.dtype):
            yield ArgsKwargs(image_loader, threshold=threshold)


def uint8_to_float32_threshold_adapter(other_args, kwargs):
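    # The reference inputs are uint8 with thresholds in [0, 255]; for the float32 comparison the same
    # threshold has to be rescaled into [0, 1].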
    return other_args, dict(threshold=kwargs["threshold"] / 255)


def sample_inputs_solarize_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype)))


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.solarize_image_tensor,
            kernel_name="solarize_image_tensor",
            sample_inputs_fn=sample_inputs_solarize_image_tensor,
            reference_fn=pil_reference_wrapper(F.solarize_image_pil),
            reference_inputs_fn=reference_inputs_solarize_image_tensor,
            float32_vs_uint8=uint8_to_float32_threshold_adapter,
            closeness_kwargs=float32_vs_uint8_pixel_difference(),
        ),
        KernelInfo(
            F.solarize_video,
            sample_inputs_fn=sample_inputs_solarize_video,
        ),
    ]
)


def sample_inputs_autocontrast_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader)


def reference_inputs_autocontrast_image_tensor():
    for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]):
        yield ArgsKwargs(image_loader)


def sample_inputs_autocontrast_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.autocontrast_image_tensor,
            kernel_name="autocontrast_image_tensor",
            sample_inputs_fn=sample_inputs_autocontrast_image_tensor,
            reference_fn=pil_reference_wrapper(F.autocontrast_image_pil),
            reference_inputs_fn=reference_inputs_autocontrast_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **pil_reference_pixel_difference(),
                **float32_vs_uint8_pixel_difference(),
            },
        ),
        KernelInfo(
            F.autocontrast_video,
            sample_inputs_fn=sample_inputs_autocontrast_video,
        ),
    ]
)

_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5]


def sample_inputs_adjust_sharpness_image_tensor():
    for image_loader in make_image_loaders(
        sizes=["random", (2, 2)],
        color_spaces=("GRAY", "RGB"),
    ):
        yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0])


def reference_inputs_adjust_sharpness_image_tensor():
    for image_loader, sharpness_factor in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_SHARPNESS_FACTORS,
    ):
        yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor)


def sample_inputs_adjust_sharpness_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_sharpness_image_tensor,
            kernel_name="adjust_sharpness_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil),
            reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs=float32_vs_uint8_pixel_difference(2),
        ),
        KernelInfo(
            F.adjust_sharpness_video,
            sample_inputs_fn=sample_inputs_adjust_sharpness_video,
        ),
    ]
)


def sample_inputs_erase_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"]):
        # FIXME: make the parameters more diverse
        h, w = 6, 7
        v = torch.rand(image_loader.num_channels, h, w)
        yield ArgsKwargs(image_loader, i=1, j=2, h=h, w=w, v=v)


def sample_inputs_erase_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        # FIXME: make the parameters more diverse
        h, w = 6, 7
        v = torch.rand(video_loader.num_channels, h, w)
        yield ArgsKwargs(video_loader, i=1, j=2, h=h, w=w, v=v)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.erase_image_tensor,
            kernel_name="erase_image_tensor",
            sample_inputs_fn=sample_inputs_erase_image_tensor,
        ),
        KernelInfo(
            F.erase_video,
            sample_inputs_fn=sample_inputs_erase_video,
        ),
    ]
)

_ADJUST_BRIGHTNESS_FACTORS = [0.1, 0.5]


def sample_inputs_adjust_brightness_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0])


def reference_inputs_adjust_brightness_image_tensor():
    for image_loader, brightness_factor in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_BRIGHTNESS_FACTORS,
    ):
        yield ArgsKwargs(image_loader, brightness_factor=brightness_factor)


def sample_inputs_adjust_brightness_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_brightness_image_tensor,
            kernel_name="adjust_brightness_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil),
            reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs=float32_vs_uint8_pixel_difference(),
        ),
        KernelInfo(
            F.adjust_brightness_video,
            sample_inputs_fn=sample_inputs_adjust_brightness_video,
        ),
    ]
)


_ADJUST_CONTRAST_FACTORS = [0.1, 0.5]


def sample_inputs_adjust_contrast_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0])


def reference_inputs_adjust_contrast_image_tensor():
    for image_loader, contrast_factor in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_CONTRAST_FACTORS,
    ):
        yield ArgsKwargs(image_loader, contrast_factor=contrast_factor)


def sample_inputs_adjust_contrast_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_contrast_image_tensor,
            kernel_name="adjust_contrast_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil),
            reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **pil_reference_pixel_difference(),
                **float32_vs_uint8_pixel_difference(2),
                **cuda_vs_cpu_pixel_difference(),
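                # Keys of the form ((test class, test name), dtype, device) relax the tolerance for
                # that specific combination only.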
                (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1),
            },
        ),
        KernelInfo(
            F.adjust_contrast_video,
            sample_inputs_fn=sample_inputs_adjust_contrast_video,
            closeness_kwargs={
                **cuda_vs_cpu_pixel_difference(),
                (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1),
            },
        ),
    ]
)

_ADJUST_GAMMA_GAMMAS_GAINS = [
    (0.5, 2.0),
    (0.0, 1.0),
]


def sample_inputs_adjust_gamma_image_tensor():
    gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0]
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, gamma=gamma, gain=gain)


def reference_inputs_adjust_gamma_image_tensor():
    for image_loader, (gamma, gain) in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_GAMMA_GAMMAS_GAINS,
    ):
        yield ArgsKwargs(image_loader, gamma=gamma, gain=gain)


def sample_inputs_adjust_gamma_video():
    gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0]
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, gamma=gamma, gain=gain)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_gamma_image_tensor,
            kernel_name="adjust_gamma_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil),
            reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **pil_reference_pixel_difference(),
                **float32_vs_uint8_pixel_difference(),
            },
        ),
        KernelInfo(
            F.adjust_gamma_video,
            sample_inputs_fn=sample_inputs_adjust_gamma_video,
        ),
    ]
)


_ADJUST_HUE_FACTORS = [-0.1, 0.5]


def sample_inputs_adjust_hue_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0])


def reference_inputs_adjust_hue_image_tensor():
    for image_loader, hue_factor in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_HUE_FACTORS,
    ):
        yield ArgsKwargs(image_loader, hue_factor=hue_factor)


def sample_inputs_adjust_hue_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_hue_image_tensor,
            kernel_name="adjust_hue_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_hue_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil),
            reference_inputs_fn=reference_inputs_adjust_hue_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **pil_reference_pixel_difference(2, mae=True),
                **float32_vs_uint8_pixel_difference(),
            },
        ),
        KernelInfo(
            F.adjust_hue_video,
            sample_inputs_fn=sample_inputs_adjust_hue_video,
        ),
    ]
)

_ADJUST_SATURATION_FACTORS = [0.1, 0.5]


def sample_inputs_adjust_saturation_image_tensor():
    for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")):
        yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0])


def reference_inputs_adjust_saturation_image_tensor():
    for image_loader, saturation_factor in itertools.product(
        make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]),
        _ADJUST_SATURATION_FACTORS,
    ):
        yield ArgsKwargs(image_loader, saturation_factor=saturation_factor)


def sample_inputs_adjust_saturation_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0])


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.adjust_saturation_image_tensor,
            kernel_name="adjust_saturation_image_tensor",
            sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor,
            reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil),
            reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor,
            float32_vs_uint8=True,
            closeness_kwargs={
                **pil_reference_pixel_difference(),
                **float32_vs_uint8_pixel_difference(2),
                **cuda_vs_cpu_pixel_difference(),
            },
        ),
        KernelInfo(
            F.adjust_saturation_video,
            sample_inputs_fn=sample_inputs_adjust_saturation_video,
            closeness_kwargs=cuda_vs_cpu_pixel_difference(),
        ),
    ]
)


def sample_inputs_clamp_bounding_box():
    for bounding_box_loader in make_bounding_box_loaders():
        yield ArgsKwargs(
            bounding_box_loader,
            format=bounding_box_loader.format,
            spatial_size=bounding_box_loader.spatial_size,
        )


KERNEL_INFOS.append(
    KernelInfo(
        F.clamp_bounding_box,
        sample_inputs_fn=sample_inputs_clamp_bounding_box,
        logs_usage=True,
    )
)

_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]]
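# The sizes above cover all valid `size` specifications: a plain int, single-element sequences, and
# (height, width) sequences.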


def _get_five_ten_crop_spatial_size(size):
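    # Return a spatial size twice the crop size in each dimension, so that all five / ten crops fit
    # inside the image.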
    if isinstance(size, int):
        crop_height = crop_width = size
    elif len(size) == 1:
        crop_height = crop_width = size[0]
    else:
        crop_height, crop_width = size
    return 2 * crop_height, 2 * crop_width


def sample_inputs_five_crop_image_tensor():
    for size in _FIVE_TEN_CROP_SIZES:
        for image_loader in make_image_loaders(
            sizes=[_get_five_ten_crop_spatial_size(size)],
            color_spaces=["RGB"],
            dtypes=[torch.float32],
        ):
            yield ArgsKwargs(image_loader, size=size)


def reference_inputs_five_crop_image_tensor():
    for size in _FIVE_TEN_CROP_SIZES:
        for image_loader in make_image_loaders(
            sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8]
        ):
            yield ArgsKwargs(image_loader, size=size)


def sample_inputs_five_crop_video():
    size = _FIVE_TEN_CROP_SIZES[0]
    for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]):
        yield ArgsKwargs(video_loader, size=size)


def sample_inputs_ten_crop_image_tensor():
    for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
        for image_loader in make_image_loaders(
            sizes=[_get_five_ten_crop_spatial_size(size)],
            color_spaces=["RGB"],
            dtypes=[torch.float32],
        ):
            yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)


def reference_inputs_ten_crop_image_tensor():
    for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
        for image_loader in make_image_loaders(
            sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8]
        ):
            yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)


def sample_inputs_ten_crop_video():
    size = _FIVE_TEN_CROP_SIZES[0]
    for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]):
        yield ArgsKwargs(video_loader, size=size)


def multi_crop_pil_reference_wrapper(pil_kernel):
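    # The PIL five / ten crop kernels return a sequence of images rather than a single one, so each
    # element has to be converted back to a tensor of the input dtype individually.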
    def wrapper(input_tensor, *other_args, **kwargs):
        output = pil_reference_wrapper(pil_kernel)(input_tensor, *other_args, **kwargs)
        return type(output)(
            F.convert_dtype_image_tensor(F.to_image_tensor(output_pil), dtype=input_tensor.dtype)
            for output_pil in output
        )

    return wrapper


_common_five_ten_crop_marks = [
    xfail_jit_python_scalar_arg("size"),
    mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."),
]

KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.five_crop_image_tensor,
            sample_inputs_fn=sample_inputs_five_crop_image_tensor,
            reference_fn=multi_crop_pil_reference_wrapper(F.five_crop_image_pil),
            reference_inputs_fn=reference_inputs_five_crop_image_tensor,
            test_marks=_common_five_ten_crop_marks,
        ),
        KernelInfo(
            F.five_crop_video,
            sample_inputs_fn=sample_inputs_five_crop_video,
            test_marks=_common_five_ten_crop_marks,
        ),
        KernelInfo(
            F.ten_crop_image_tensor,
            sample_inputs_fn=sample_inputs_ten_crop_image_tensor,
            reference_fn=multi_crop_pil_reference_wrapper(F.ten_crop_image_pil),
            reference_inputs_fn=reference_inputs_ten_crop_image_tensor,
            test_marks=_common_five_ten_crop_marks,
        ),
        KernelInfo(
            F.ten_crop_video,
            sample_inputs_fn=sample_inputs_ten_crop_video,
            test_marks=_common_five_ten_crop_marks,
        ),
    ]
)

_NORMALIZE_MEANS_STDS = [
    ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]),
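    # scalar mean and std are also valid and apply to all channels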
    (0.5, 2.0),
]


def sample_inputs_normalize_image_tensor():
    for image_loader, (mean, std) in itertools.product(
        make_image_loaders(sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32]),
        _NORMALIZE_MEANS_STDS,
    ):
        yield ArgsKwargs(image_loader, mean=mean, std=std)


def reference_normalize_image_tensor(image, mean, std, inplace=False):
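    # Reshape mean and std to (C, 1, 1) so they broadcast over the spatial dimensions.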
    mean = torch.tensor(mean).view(-1, 1, 1)
    std = torch.tensor(std).view(-1, 1, 1)

    sub = torch.Tensor.sub_ if inplace else torch.Tensor.sub
    return sub(image, mean).div_(std)


def reference_inputs_normalize_image_tensor():
    yield ArgsKwargs(
        make_image_loader(size=(32, 32), color_space="RGB", extra_dims=[1]),
        mean=[0.5, 0.5, 0.5],
        std=[1.0, 1.0, 1.0],
    )


def sample_inputs_normalize_video():
    mean, std = _NORMALIZE_MEANS_STDS[0]
    for video_loader in make_video_loaders(
        sizes=["random"], color_spaces=["RGB"], num_frames=["random"], dtypes=[torch.float32]
    ):
        yield ArgsKwargs(video_loader, mean=mean, std=std)


KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.normalize_image_tensor,
            kernel_name="normalize_image_tensor",
            sample_inputs_fn=sample_inputs_normalize_image_tensor,
            reference_fn=reference_normalize_image_tensor,
            reference_inputs_fn=reference_inputs_normalize_image_tensor,
            test_marks=[
                xfail_jit_python_scalar_arg("mean"),
                xfail_jit_python_scalar_arg("std"),
            ],
        ),
        KernelInfo(
            F.normalize_video,
            sample_inputs_fn=sample_inputs_normalize_video,
        ),
    ]
)


def sample_inputs_convert_dtype_image_tensor():
    for input_dtype, output_dtype in itertools.product(
        [torch.uint8, torch.int64, torch.float32, torch.float64], repeat=2
    ):
        if input_dtype.is_floating_point and output_dtype == torch.int64:
            # the conversion cannot be performed safely, since floating point dtypes cannot represent
            # all consecutive integers over the full range of int64
            continue

        for image_loader in make_image_loaders(sizes=["random"], color_spaces=["RGB"], dtypes=[input_dtype]):
            yield ArgsKwargs(image_loader, dtype=output_dtype)


def reference_convert_dtype_image_tensor(image, dtype=torch.float):
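    # Exact per-value reference: each value is converted individually with `decimal.Decimal`
    # arithmetic to sidestep floating point rounding issues.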
    input_dtype = image.dtype
    output_dtype = dtype

    if output_dtype == input_dtype:
        return image

    def fn(value):
        if input_dtype.is_floating_point:
            if output_dtype.is_floating_point:
                return value
            else:
                return int(decimal.Decimal(value) * torch.iinfo(output_dtype).max)
        else:
            input_max_value = torch.iinfo(input_dtype).max

            if output_dtype.is_floating_point:
                return float(decimal.Decimal(value) / input_max_value)
            else:
                output_max_value = torch.iinfo(output_dtype).max

                if input_max_value > output_max_value:
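                    # Downscale by the exact power-of-two ratio of the value ranges, e.g.
                    # int16 -> uint8: factor = 32768 // 256 = 128.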
                    factor = (input_max_value + 1) // (output_max_value + 1)
                    return value // factor
                else:
                    factor = (output_max_value + 1) // (input_max_value + 1)
                    return value * factor

    return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype)


def reference_inputs_convert_dtype_image_tensor():
    for input_dtype, output_dtype in itertools.product(
        [
            torch.uint8,
            torch.int16,
            torch.int32,
            torch.int64,
            torch.float16,
            torch.float32,
            torch.float64,
            torch.bfloat16,
        ],
        repeat=2,
    ):
        if (input_dtype == torch.float32 and output_dtype in {torch.int32, torch.int64}) or (
            input_dtype == torch.float64 and output_dtype == torch.int64
        ):
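            # these conversions cannot be performed safely, since the floating point dtype cannot
            # represent all consecutive integers over the full range of the integer dtype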
            continue

        if input_dtype.is_floating_point:
            data = [0.0, 0.5, 1.0]
        else:
            max_value = torch.iinfo(input_dtype).max
            data = [0, max_value // 2, max_value]
        image = torch.tensor(data, dtype=input_dtype)

        yield ArgsKwargs(image, dtype=output_dtype)


def sample_inputs_convert_dtype_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        yield ArgsKwargs(video_loader)


skip_dtype_consistency = TestMark(
    ("TestKernels", "test_dtype_and_device_consistency"),
    pytest.mark.skip(reason="`convert_dtype_*` kernels convert the dtype by design"),
    condition=lambda args_kwargs: args_kwargs.args[0].dtype != args_kwargs.kwargs.get("dtype", torch.float32),
)

KERNEL_INFOS.extend(
    [
        KernelInfo(
            F.convert_dtype_image_tensor,
            sample_inputs_fn=sample_inputs_convert_dtype_image_tensor,
            reference_fn=reference_convert_dtype_image_tensor,
            reference_inputs_fn=reference_inputs_convert_dtype_image_tensor,
            test_marks=[
                skip_dtype_consistency,
                TestMark(
                    ("TestKernels", "test_against_reference"),
                    pytest.mark.xfail(reason="Conversion overflows"),
                    condition=lambda args_kwargs: (
                        args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16}
                        and not args_kwargs.kwargs["dtype"].is_floating_point
                    )
                    or (
                        args_kwargs.args[0].dtype in {torch.int32, torch.int64}
                        and args_kwargs.kwargs["dtype"] == torch.float16
                    ),
                ),
            ],
        ),
        KernelInfo(
            F.convert_dtype_video,
            sample_inputs_fn=sample_inputs_convert_dtype_video,
            test_marks=[
                skip_dtype_consistency,
            ],
        ),
    ]
)


def sample_inputs_uniform_temporal_subsample_video():
    for video_loader in make_video_loaders(sizes=["random"], num_frames=[4]):
        yield ArgsKwargs(video_loader, num_samples=2)


def reference_uniform_temporal_subsample_video(x, num_samples):
    # Copy-pasted from
    # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19
    t = x.shape[-4]
    assert num_samples > 0 and t > 0
    # Sample by nearest neighbor interpolation if num_samples > t.
    indices = torch.linspace(0, t - 1, num_samples)
    indices = torch.clamp(indices, 0, t - 1).long()
    return torch.index_select(x, -4, indices)


def reference_inputs_uniform_temporal_subsample_video():
    for video_loader in make_video_loaders(sizes=["random"], color_spaces=["RGB"], num_frames=[10]):
        for num_samples in range(1, video_loader.shape[-4] + 1):
            yield ArgsKwargs(video_loader, num_samples)


KERNEL_INFOS.append(
    KernelInfo(
        F.uniform_temporal_subsample_video,
        sample_inputs_fn=sample_inputs_uniform_temporal_subsample_video,
        reference_fn=reference_uniform_temporal_subsample_video,
        reference_inputs_fn=reference_inputs_uniform_temporal_subsample_video,
    )
)