# coding=utf-8
# Copyright 2021 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import pathlib
import unittest

from parameterized import parameterized

from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available

from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image

    from transformers import YolosImageProcessor


class YolosImageProcessingTester(unittest.TestCase):
    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
        do_rescale=True,
        rescale_factor=1 / 255,
        do_pad=True,
    ):
        # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
        size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_pad = do_pad

    def prepare_image_processor_dict(self):
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_rescale": self.do_rescale,
            "rescale_factor": self.rescale_factor,
            "do_pad": self.do_pad,
        }

    def get_expected_values(self, image_inputs, batched=False):
        """
        This function computes the expected height and width when providing images to YolosImageProcessor,
        assuming do_resize is set to True and size is a dict with "shortest_edge"/"longest_edge" keys.
        """
        if not batched:
            image = image_inputs[0]
            if isinstance(image, Image.Image):
                width, height = image.size
            else:
                height, width = image.shape[1], image.shape[2]

            size = self.size["shortest_edge"]
            max_size = self.size.get("longest_edge", None)
            if max_size is not None:
                min_original_size = float(min((height, width)))
                max_original_size = float(max((height, width)))
                if max_original_size / min_original_size * size > max_size:
                    size = int(round(max_size * min_original_size / max_original_size))

            if width <= height and width != size:
                height = int(size * height / width)
                width = size
            elif height < width and height != size:
                width = int(size * width / height)
                height = size
            width_mod = width % 16
            height_mod = height % 16
            expected_width = width - width_mod
            expected_height = height - height_mod

        else:
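            # for batched inputs the processor pads to a common size, so take the largest
            # expected height and width across the batch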
            expected_values = []
            for image in image_inputs:
                expected_height, expected_width = self.get_expected_values([image])
                expected_values.append((expected_height, expected_width))
            expected_height = max(expected_values, key=lambda item: item[0])[0]
            expected_width = max(expected_values, key=lambda item: item[1])[1]

        return expected_height, expected_width

    def expected_output_image_shape(self, images):
        height, width = self.get_expected_values(images, batched=True)
        return self.num_channels, height, width

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )


@require_torch
@require_vision
class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = YolosImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = YolosImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processing, "image_mean"))
        self.assertTrue(hasattr(image_processing, "image_std"))
        self.assertTrue(hasattr(image_processing, "do_normalize"))
        self.assertTrue(hasattr(image_processing, "do_resize"))
        self.assertTrue(hasattr(image_processing, "size"))

    def test_image_processor_from_dict_with_kwargs(self):
        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
        self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
        self.assertEqual(image_processor.do_pad, True)

        image_processor = self.image_processing_class.from_dict(
            self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
        )
        self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
        self.assertEqual(image_processor.do_pad, False)

    def test_equivalence_padding(self):
        # Initialize image processors
        image_processing_1 = self.image_processing_class(**self.image_processor_dict)
        image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        # Test whether the method "pad" and calling the image processor return the same tensors
        encoded_images_with_method = image_processing_1.pad(image_inputs, return_tensors="pt")
        encoded_images = image_processing_2(image_inputs, return_tensors="pt")

        self.assertTrue(
            torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
        )

    @parameterized.expand(
        [
            ((3, 100, 1500), 1333, 800),
            ((3, 400, 400), 1333, 800),
            ((3, 1500, 1500), 1333, 800),
            ((3, 800, 1333), 1333, 800),
            ((3, 1333, 800), 1333, 800),
            ((3, 800, 800), 400, 400),
        ]
    )
    def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
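        # Each case is (image_size as (num_channels, height, width), longest_edge, shortest_edge);
        # the resized output must keep its longest side within 1333 and its shortest side within 800.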
        image_processor = self.image_processing_class(**self.image_processor_dict)

        # create a random torch tensor as the image
        image = torch.randint(0, 256, image_size, dtype=torch.uint8)
        processed_image = image_processor(
            image,
            size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
            do_pad=False,
            return_tensors="pt",
        )["pixel_values"]

        shape = list(processed_image.shape[-2:])
        max_size, min_size = max(shape), min(shape)
        self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
        self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")

    @slow
    def test_call_pytorch_with_coco_detection_annotations(self):
        # prepare image and target
        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.loads(f.read())

        target = {"image_id": 39769, "annotations": target}

        # encode them
        image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
        encoding = image_processing(images=image, annotations=target, return_tensors="pt")

        # verify pixel values
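        # the 640x480 image is resized so its shortest edge becomes 800 (-> 800x1066),
        # then rounded down to a multiple of 16, giving 800x1056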
        expected_shape = torch.Size([1, 3, 800, 1056])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))

        # verify area
        expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
        # verify image_id
        expected_image_id = torch.tensor([39769])
        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
        # verify class_labels
        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
        # verify size
        expected_size = torch.tensor([800, 1056])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))

    @slow
    def test_call_pytorch_with_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())

        target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

        # encode them
        image_processing = YolosImageProcessor(format="coco_panoptic")
        encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

        # verify pixel values
        expected_shape = torch.Size([1, 3, 800, 1056])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))

        # verify area
        expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
        # verify image_id
        expected_image_id = torch.tensor([39769])
        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
        # verify class_labels
        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
        # verify masks
        expected_masks_sum = 815161
        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
        # verify size
        expected_size = torch.tensor([800, 1056])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))

    # Output size is slightly different from DETR, as YOLOS rounds dimensions down to a multiple of 16
    @slow
    def test_batched_coco_detection_annotations(self):
        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.loads(f.read())

        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Adjust the bounding boxes for the resized image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for i in range(len(annotations_1["annotations"])):
            coords = annotations_1["annotations"][i]["bbox"]
            new_bbox = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]
            annotations_1["annotations"][i]["bbox"] = new_bbox

        images = [image_0, image_1]
        annotations = [annotations_0, annotations_1]

        image_processing = YolosImageProcessor()
        encoding = image_processing(
            images=images,
            annotations=annotations,
            return_segmentation_masks=True,
            return_tensors="pt",  # do_convert_annotations=True
        )

        # Check the pixel values have been padded
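        # image_0 (640x480) resizes to 800x1056 (multiple of 16), image_1 (800x800) stays 800x800,
        # so the whole batch is padded to 800x1056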
        postprocessed_height, postprocessed_width = 800, 1056
        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        # Check the bounding boxes have been adjusted for padded images
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        expected_boxes_0 = torch.tensor(
            [
                [0.6879, 0.4609, 0.0755, 0.3691],
                [0.2118, 0.3359, 0.2601, 0.1566],
                [0.5011, 0.5000, 0.9979, 1.0000],
                [0.5010, 0.5020, 0.9979, 0.9959],
                [0.3284, 0.5944, 0.5884, 0.8112],
                [0.8394, 0.5445, 0.3213, 0.9110],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.4169, 0.2765, 0.0458, 0.2215],
                [0.1284, 0.2016, 0.1576, 0.0940],
                [0.3792, 0.4933, 0.7559, 0.9865],
                [0.3794, 0.5002, 0.7563, 0.9955],
                [0.1990, 0.5456, 0.3566, 0.8646],
                [0.5845, 0.4115, 0.3462, 0.7161],
            ]
        )
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3))

        # Check the masks have also been padded
        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

        # Check that if do_convert_annotations=False, the annotations are not converted to the
        # centre_x, centre_y, width, height format and are not normalized to the range [0, 1]
        encoding = image_processing(
            images=images,
            annotations=annotations,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        # Convert to absolute coordinates
        unnormalized_boxes_0 = torch.vstack(
            [
                expected_boxes_0[:, 0] * postprocessed_width,
                expected_boxes_0[:, 1] * postprocessed_height,
                expected_boxes_0[:, 2] * postprocessed_width,
                expected_boxes_0[:, 3] * postprocessed_height,
            ]
        ).T
        unnormalized_boxes_1 = torch.vstack(
            [
                expected_boxes_1[:, 0] * postprocessed_width,
                expected_boxes_1[:, 1] * postprocessed_height,
                expected_boxes_1[:, 2] * postprocessed_width,
                expected_boxes_1[:, 3] * postprocessed_height,
            ]
        ).T
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        expected_boxes_0 = torch.vstack(
            [
                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
            ]
        ).T
        expected_boxes_1 = torch.vstack(
            [
                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
            ]
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))

    # Output size is slightly different from DETR, as YOLOS rounds dimensions down to a multiple of 16
    def test_batched_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())

        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for i in range(len(annotation_1["segments_info"])):
            coords = annotation_1["segments_info"][i]["bbox"]
            new_bbox = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]
            annotation_1["segments_info"][i]["bbox"] = new_bbox

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

        images = [image_0, image_1]
        annotations = [annotation_0, annotation_1]

        # encode them
        image_processing = YolosImageProcessor(format="coco_panoptic")
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_tensors="pt",
            return_segmentation_masks=True,
        )

        # Check the pixel values have been padded
        postprocessed_height, postprocessed_width = 800, 1056
        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        # Check the bounding boxes have been adjusted for padded images
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        expected_boxes_0 = torch.tensor(
            [
                [0.2625, 0.5437, 0.4688, 0.8625],
                [0.7719, 0.4104, 0.4531, 0.7125],
                [0.5000, 0.4927, 0.9969, 0.9854],
                [0.1688, 0.2000, 0.2063, 0.0917],
                [0.5492, 0.2760, 0.0578, 0.2187],
                [0.4992, 0.4990, 0.9984, 0.9979],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.1591, 0.3262, 0.2841, 0.5175],
                [0.4678, 0.2463, 0.2746, 0.4275],
                [0.3030, 0.2956, 0.6042, 0.5913],
                [0.1023, 0.1200, 0.1250, 0.0550],
                [0.3329, 0.1656, 0.0350, 0.1312],
                [0.3026, 0.2994, 0.6051, 0.5987],
            ]
        )
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3))

        # Check the masks have also been padded
        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

        # Check that if do_convert_annotations=False, the annotations are not converted to the
        # centre_x, centre_y, width, height format and are not normalized to the range [0, 1]
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        # Convert to absolute coordinates
        unnormalized_boxes_0 = torch.vstack(
            [
                expected_boxes_0[:, 0] * postprocessed_width,
                expected_boxes_0[:, 1] * postprocessed_height,
                expected_boxes_0[:, 2] * postprocessed_width,
                expected_boxes_0[:, 3] * postprocessed_height,
            ]
        ).T
        unnormalized_boxes_1 = torch.vstack(
            [
                expected_boxes_1[:, 0] * postprocessed_width,
                expected_boxes_1[:, 1] * postprocessed_height,
                expected_boxes_1[:, 2] * postprocessed_width,
                expected_boxes_1[:, 3] * postprocessed_height,
            ]
        ).T
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        expected_boxes_0 = torch.vstack(
            [
                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
            ]
        ).T
        expected_boxes_1 = torch.vstack(
            [
                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
            ]
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = YolosImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = YolosImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = YolosImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
        image_processor = YolosImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
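        # note: pad_size defines the final padded canvas, so it can exceed the resized image (301x101 here)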

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = YolosImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))