# coding=utf-8
# Copyright 2021 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import pathlib
import unittest

from parameterized import parameterized

from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available

from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image

    from transformers import YolosImageProcessor


class YolosImageProcessingTester(unittest.TestCase):
    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
        do_rescale=True,
        rescale_factor=1 / 255,
        do_pad=True,
    ):
        # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
        size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_pad = do_pad

    def prepare_image_processor_dict(self):
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_rescale": self.do_rescale,
            "rescale_factor": self.rescale_factor,
            "do_pad": self.do_pad,
        }

    def get_expected_values(self, image_inputs, batched=False):
        """
        This function computes the expected height and width when providing images to YolosImageProcessor,
        assuming do_resize is set to True with a scalar size.
        """
        if not batched:
            image = image_inputs[0]
            if isinstance(image, Image.Image):
                width, height = image.size
            else:
                height, width = image.shape[1], image.shape[2]

            # Mirror the image processor's resize: scale the shortest edge to `size`, shrinking
            # the target if the longest edge would otherwise exceed `longest_edge`
            size = self.size["shortest_edge"]
            max_size = self.size.get("longest_edge", None)
            if max_size is not None:
                min_original_size = float(min((height, width)))
                max_original_size = float(max((height, width)))
                if max_original_size / min_original_size * size > max_size:
                    size = int(round(max_size * min_original_size / max_original_size))

            if width <= height and width != size:
                height = int(size * height / width)
                width = size
            elif height < width and height != size:
                width = int(size * width / height)
                height = size
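            # YOLOS additionally floors each output dimension to a multiple of 16, unlike DETR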
            width_mod = width % 16
            height_mod = height % 16
            expected_width = width - width_mod
            expected_height = height - height_mod

        else:
            expected_values = []
            for image in image_inputs:
                expected_height, expected_width = self.get_expected_values([image])
                expected_values.append((expected_height, expected_width))
            expected_height = max(expected_values, key=lambda item: item[0])[0]
            expected_width = max(expected_values, key=lambda item: item[1])[1]

        return expected_height, expected_width

    def expected_output_image_shape(self, images):
        height, width = self.get_expected_values(images, batched=True)
        return self.num_channels, height, width

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )


@require_torch
@require_vision
class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = YolosImageProcessor if is_vision_available() else None

    def setUp(self):
        self.image_processor_tester = YolosImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processing, "image_mean"))
        self.assertTrue(hasattr(image_processing, "image_std"))
        self.assertTrue(hasattr(image_processing, "do_normalize"))
        self.assertTrue(hasattr(image_processing, "do_resize"))
        self.assertTrue(hasattr(image_processing, "size"))

    def test_image_processor_from_dict_with_kwargs(self):
        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
        self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
        self.assertEqual(image_processor.do_pad, True)

        image_processor = self.image_processing_class.from_dict(
            self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False
        )
        self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
        self.assertEqual(image_processor.do_pad, False)

    def test_equivalence_padding(self):
        # Initialize image processors
        image_processing_1 = self.image_processing_class(**self.image_processor_dict)
        image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
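        # image_processing_2 only pads (resize, rescale and normalize are disabled), so calling it
        # should give the same result as image_processing_1.pad on the same inputs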
        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        # Test whether the method "pad" and calling the image processor return the same tensors
        encoded_images_with_method = image_processing_1.pad(image_inputs, return_tensors="pt")
        encoded_images = image_processing_2(image_inputs, return_tensors="pt")

        self.assertTrue(
            torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
        )

    @parameterized.expand(
        [
            ((3, 100, 1500), 1333, 800),
            ((3, 400, 400), 1333, 800),
            ((3, 1500, 1500), 1333, 800),
            ((3, 800, 1333), 1333, 800),
            ((3, 1333, 800), 1333, 800),
            ((3, 800, 800), 400, 400),
        ]
    )
    def test_resize_max_size_respected(self, image_size, longest_edge, shortest_edge):
        image_processor = self.image_processing_class(**self.image_processor_dict)

        # create a torch tensor as the image
        image = torch.randint(0, 256, image_size, dtype=torch.uint8)
        processed_image = image_processor(
            image,
            size={"longest_edge": longest_edge, "shortest_edge": shortest_edge},
            do_pad=False,
            return_tensors="pt",
        )["pixel_values"]

        shape = list(processed_image.shape[-2:])
        max_size, min_size = max(shape), min(shape)
        self.assertTrue(max_size <= 1333, f"Expected max_size <= 1333, got image shape {shape}")
        self.assertTrue(min_size <= 800, f"Expected min_size <= 800, got image shape {shape}")

    @slow
    def test_call_pytorch_with_coco_detection_annotations(self):
        # prepare image and target
        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.loads(f.read())

        target = {"image_id": 39769, "annotations": target}

        # encode them
        image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small")
        encoding = image_processing(images=image, annotations=target, return_tensors="pt")

        # verify pixel values
        expected_shape = torch.Size([1, 3, 800, 1056])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))

        # verify area
        expected_area = torch.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250])
        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
        # verify image_id
        expected_image_id = torch.tensor([39769])
        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
        # verify class_labels
        expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
        # verify size
        expected_size = torch.tensor([800, 1056])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))

    @slow
    def test_call_pytorch_with_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())

        target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

        # encode them
        image_processing = YolosImageProcessor(format="coco_panoptic")
        encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

        # verify pixel values
        expected_shape = torch.Size([1, 3, 800, 1056])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
        self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))

        # verify area
        expected_area = torch.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000])
        self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
        # verify boxes
        expected_boxes_shape = torch.Size([6, 4])
        self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
        expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
        # verify image_id
        expected_image_id = torch.tensor([39769])
        self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
        # verify is_crowd
        expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
        self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
        # verify class_labels
        expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
        self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
        # verify masks
        expected_masks_sum = 815161
        self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
        # verify orig_size
        expected_orig_size = torch.tensor([480, 640])
        self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
        # verify size
        expected_size = torch.tensor([800, 1056])
        self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))

    # Output size is slightly different from DETR's, as YOLOS floors each dimension to a multiple of 16
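    # e.g. the 480x640 COCO fixture image resizes to 800x1066 with the DETR logic, which
    # YOLOS then floors to the 800x1056 shape asserted in these tests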
    @slow
    def test_batched_coco_detection_annotations(self):
        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
            target = json.loads(f.read())

        annotations_0 = {"image_id": 39769, "annotations": target}
        annotations_1 = {"image_id": 39769, "annotations": target}

        # Adjust the bounding boxes for the resized image
        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for i in range(len(annotations_1["annotations"])):
            coords = annotations_1["annotations"][i]["bbox"]
            new_bbox = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]
            annotations_1["annotations"][i]["bbox"] = new_bbox

        images = [image_0, image_1]
        annotations = [annotations_0, annotations_1]

        image_processing = YolosImageProcessor()
        encoding = image_processing(
            images=images,
            annotations=annotations,
            return_segmentation_masks=True,
            return_tensors="pt",  # do_convert_annotations=True
        )
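        # By default do_convert_annotations=True, so the returned boxes are normalized
        # (centre_x, centre_y, width, height) values in [0, 1]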

        # Check the pixel values have been padded
        postprocessed_height, postprocessed_width = 800, 1056
        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        # Check the bounding boxes have been adjusted for padded images
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        expected_boxes_0 = torch.tensor(
            [
                [0.6879, 0.4609, 0.0755, 0.3691],
                [0.2118, 0.3359, 0.2601, 0.1566],
                [0.5011, 0.5000, 0.9979, 1.0000],
                [0.5010, 0.5020, 0.9979, 0.9959],
                [0.3284, 0.5944, 0.5884, 0.8112],
                [0.8394, 0.5445, 0.3213, 0.9110],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.4169, 0.2765, 0.0458, 0.2215],
                [0.1284, 0.2016, 0.1576, 0.0940],
                [0.3792, 0.4933, 0.7559, 0.9865],
                [0.3794, 0.5002, 0.7563, 0.9955],
                [0.1990, 0.5456, 0.3566, 0.8646],
                [0.5845, 0.4115, 0.3462, 0.7161],
            ]
        )
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3))

        # Check the masks have also been padded
        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

        # Check that if do_convert_annotations=False, the annotations are not converted to the
        # centre_x, centre_y, width, height format and are not normalized to the range [0, 1]
        encoding = image_processing(
            images=images,
            annotations=annotations,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        # Convert to absolute coordinates
        unnormalized_boxes_0 = torch.vstack(
            [
                expected_boxes_0[:, 0] * postprocessed_width,
                expected_boxes_0[:, 1] * postprocessed_height,
                expected_boxes_0[:, 2] * postprocessed_width,
                expected_boxes_0[:, 3] * postprocessed_height,
            ]
        ).T
        unnormalized_boxes_1 = torch.vstack(
            [
                expected_boxes_1[:, 0] * postprocessed_width,
                expected_boxes_1[:, 1] * postprocessed_height,
                expected_boxes_1[:, 2] * postprocessed_width,
                expected_boxes_1[:, 3] * postprocessed_height,
            ]
        ).T
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        expected_boxes_0 = torch.vstack(
            [
                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
            ]
        ).T
        expected_boxes_1 = torch.vstack(
            [
                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
            ]
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))

    # Output size is slightly different from DETR's, as YOLOS floors each dimension to a multiple of 16
    def test_batched_coco_panoptic_annotations(self):
        # prepare image, target and masks_path
        image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
        image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))

        with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
            target = json.loads(f.read())

        annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
        annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

        w_0, h_0 = image_0.size
        w_1, h_1 = image_1.size
        for i in range(len(annotation_1["segments_info"])):
            coords = annotation_1["segments_info"][i]["bbox"]
            new_bbox = [
                coords[0] * w_1 / w_0,
                coords[1] * h_1 / h_0,
                coords[2] * w_1 / w_0,
                coords[3] * h_1 / h_0,
            ]
            annotation_1["segments_info"][i]["bbox"] = new_bbox

        masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

        images = [image_0, image_1]
        annotations = [annotation_0, annotation_1]

        # encode them
        image_processing = YolosImageProcessor(format="coco_panoptic")
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_tensors="pt",
            return_segmentation_masks=True,
        )

        # Check the pixel values have been padded
        postprocessed_height, postprocessed_width = 800, 1056
        expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
        self.assertEqual(encoding["pixel_values"].shape, expected_shape)

        # Check the bounding boxes have been adjusted for padded images
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        expected_boxes_0 = torch.tensor(
            [
                [0.2625, 0.5437, 0.4688, 0.8625],
                [0.7719, 0.4104, 0.4531, 0.7125],
                [0.5000, 0.4927, 0.9969, 0.9854],
                [0.1688, 0.2000, 0.2063, 0.0917],
                [0.5492, 0.2760, 0.0578, 0.2187],
                [0.4992, 0.4990, 0.9984, 0.9979],
            ]
        )
        expected_boxes_1 = torch.tensor(
            [
                [0.1591, 0.3262, 0.2841, 0.5175],
                [0.4678, 0.2463, 0.2746, 0.4275],
                [0.3030, 0.2956, 0.6042, 0.5913],
                [0.1023, 0.1200, 0.1250, 0.0550],
                [0.3329, 0.1656, 0.0350, 0.1312],
                [0.3026, 0.2994, 0.6051, 0.5987],
            ]
        )
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3))

        # Check the masks have also been padded
        self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1056]))
        self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1056]))

        # Check that if do_convert_annotations=False, the annotations are not converted to the
        # centre_x, centre_y, width, height format and are not normalized to the range [0, 1]
        encoding = image_processing(
            images=images,
            annotations=annotations,
            masks_path=masks_path,
            return_segmentation_masks=True,
            do_convert_annotations=False,
            return_tensors="pt",
        )
        self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
        self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
        # Convert to absolute coordinates
        unnormalized_boxes_0 = torch.vstack(
            [
                expected_boxes_0[:, 0] * postprocessed_width,
                expected_boxes_0[:, 1] * postprocessed_height,
                expected_boxes_0[:, 2] * postprocessed_width,
                expected_boxes_0[:, 3] * postprocessed_height,
            ]
        ).T
        unnormalized_boxes_1 = torch.vstack(
            [
                expected_boxes_1[:, 0] * postprocessed_width,
                expected_boxes_1[:, 1] * postprocessed_height,
                expected_boxes_1[:, 2] * postprocessed_width,
                expected_boxes_1[:, 3] * postprocessed_height,
            ]
        ).T
        # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
        expected_boxes_0 = torch.vstack(
            [
                unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
                unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
                unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
            ]
        ).T
        expected_boxes_1 = torch.vstack(
            [
                unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
                unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
                unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
            ]
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1))