Add DETR (#11653)

* Squash all commits of modeling_detr_v7 branch into one * Improve docs * Fix tests * Style * Improve docs some more and fix most tests * Fix slow tests of ViT, DeiT and DETR * Improve replacement of batch norm * Restructure timm backbone forward * Make DetrForSegmentation support any timm backbone * Fix name of output * Address most comments by @LysandreJik * Give better names for variables * Conditional imports + timm in setup.py * Address additional comments by @sgugger * Make style, add require_timm and require_vision to tests * Remove train_backbone attribute of DetrConfig, add methods to freeze/unfreeze backbone * Add png files to fixtures * Fix type hint * Add timm to workflows * Add `BatchNorm2d` to the weight initialization * Fix retain_grad test * Replace model checkpoints by Facebook namespace * Fix name of checkpoint in test * Add user-friendly message when scipy is not available * Address most comments by @patrickvonplaten * Remove return_intermediate_layers attribute of DetrConfig and simplify Joiner * Better initialization * Scipy is necessary to get sklearn metrics * Rename TimmBackbone to DetrTimmConvEncoder and rename DetrJoiner to DetrConvModel * Make style * Improve docs and add 2 community notebooks Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>

Add DETR (#11653)
* Squash all commits of modeling_detr_v7 branch into one * Improve docs * Fix tests * Style * Improve docs some more and fix most tests * Fix slow tests of ViT, DeiT and DETR * Improve replacement of batch norm * Restructure timm backbone forward * Make DetrForSegmentation support any timm backbone * Fix name of output * Address most comments by @LysandreJik * Give better names for variables * Conditional imports + timm in setup.py * Address additional comments by @sgugger * Make style, add require_timm and require_vision to tests * Remove train_backbone attribute of DetrConfig, add methods to freeze/unfreeze backbone * Add png files to fixtures * Fix type hint * Add timm to workflows * Add `BatchNorm2d` to the weight initialization * Fix retain_grad test * Replace model checkpoints by Facebook namespace * Fix name of checkpoint in test * Add user-friendly message when scipy is not available * Address most comments by @patrickvonplaten * Remove return_intermediate_layers attribute of DetrConfig and simplify Joiner * Better initialization * Scipy is necessary to get sklearn metrics * Rename TimmBackbone to DetrTimmConvEncoder and rename DetrJoiner to DetrConvModel * Make style * Improve docs and add 2 community notebooks Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
d3eacbb8 · NielsRogge · GitHub · d14e0af2 · d3eacbb8 · d3eacbb8
Unverified Commit d3eacbb8 authored Jun 09, 2021 by NielsRogge Committed by GitHub Jun 09, 2021
20 changed files
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -39,6 +39,7 @@ from .file_utils import (
    is_sentencepiece_available,
    is_soundfile_availble,
    is_tf_available,
+    is_timm_available,
    is_tokenizers_available,
    is_torch_available,
    is_torch_tpu_available,
@@ -229,6 +230,19 @@ def require_onnx(test_case):
        return test_case
+def require_timm(test_case):
+    """
+    Decorator marking a test that requires Timm.
+    These tests are skipped when Timm isn't installed.
+    """
+    if not is_timm_available():
+        return unittest.skip("test requires Timm")(test_case)
+    else:
+        return test_case
 def require_torch(test_case):
    """
    Decorator marking a test that requires PyTorch.

--- a/src/transformers/utils/coco_classes.py
+++ b/src/transformers/utils/coco_classes.py
+# COCO object detection id's to class names
+id2label = {
+    0: "N/A",
+    1: "person",
+    2: "bicycle",
+    3: "car",
+    4: "motorcycle",
+    5: "airplane",
+    6: "bus",
+    7: "train",
+    8: "truck",
+    9: "boat",
+    10: "traffic light",
+    11: "fire hydrant",
+    12: "N/A",
+    13: "stop sign",
+    14: "parking meter",
+    15: "bench",
+    16: "bird",
+    17: "cat",
+    18: "dog",
+    19: "horse",
+    20: "sheep",
+    21: "cow",
+    22: "elephant",
+    23: "bear",
+    24: "zebra",
+    25: "giraffe",
+    26: "N/A",
+    27: "backpack",
+    28: "umbrella",
+    29: "N/A",
+    30: "N/A",
+    31: "handbag",
+    32: "tie",
+    33: "suitcase",
+    34: "frisbee",
+    35: "skis",
+    36: "snowboard",
+    37: "sports ball",
+    38: "kite",
+    39: "baseball bat",
+    40: "baseball glove",
+    41: "skateboard",
+    42: "surfboard",
+    43: "tennis racket",
+    44: "bottle",
+    45: "N/A",
+    46: "wine glass",
+    47: "cup",
+    48: "fork",
+    49: "knife",
+    50: "spoon",
+    51: "bowl",
+    52: "banana",
+    53: "apple",
+    54: "sandwich",
+    55: "orange",
+    56: "broccoli",
+    57: "carrot",
+    58: "hot dog",
+    59: "pizza",
+    60: "donut",
+    61: "cake",
+    62: "chair",
+    63: "couch",
+    64: "potted plant",
+    65: "bed",
+    66: "N/A",
+    67: "dining table",
+    68: "N/A",
+    69: "N/A",
+    70: "toilet",
+    71: "N/A",
+    72: "tv",
+    73: "laptop",
+    74: "mouse",
+    75: "remote",
+    76: "keyboard",
+    77: "cell phone",
+    78: "microwave",
+    79: "oven",
+    80: "toaster",
+    81: "sink",
+    82: "refrigerator",
+    83: "N/A",
+    84: "book",
+    85: "clock",
+    86: "vase",
+    87: "scissors",
+    88: "teddy bear",
+    89: "hair drier",
+    90: "toothbrush",
+}
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -334,6 +334,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = None
 MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = None
+MODEL_FOR_OBJECT_DETECTION_MAPPING = None
 MODEL_FOR_PRETRAINING_MAPPING = None

--- a/src/transformers/utils/dummy_timm_and_vision_objects.py
+++ b/src/transformers/utils/dummy_timm_and_vision_objects.py
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..file_utils import requires_backends
+DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class DetrForObjectDetection:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+class DetrForSegmentation:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+class DetrModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_backends(self, ["timm", "vision"])
--- a/src/transformers/utils/dummy_timm_objects.py
+++ b/src/transformers/utils/dummy_timm_objects.py
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..file_utils import requires_backends
+DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class DetrForObjectDetection:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+class DetrForSegmentation:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+class DetrModel:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_backends(self, ["timm"])
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -22,6 +22,11 @@ class DeiTFeatureExtractor:
        requires_backends(self, ["vision"])
+class DetrFeatureExtractor:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
 class ViTFeatureExtractor:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
--- a/tests/fixtures/coco.jpg
+++ b/tests/fixtures/coco.jpg
--- a/tests/fixtures/tests_samples/.gitignore
+++ b/tests/fixtures/tests_samples/.gitignore
-*.*
 cache*
 temp*
 !*.txt

--- a/tests/fixtures/tests_samples/COCO/cats.png
+++ b/tests/fixtures/tests_samples/COCO/cats.png
--- a/tests/fixtures/tests_samples/COCO/coco_annotations.txt
+++ b/tests/fixtures/tests_samples/COCO/coco_annotations.txt
+[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, 
"iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, 
"image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}]
\ No newline at end of file
--- a/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png
+++ b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png
--- a/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
+++ b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
+[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}]
\ No newline at end of file
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -18,6 +18,57 @@ import json
 import os
 import tempfile
+from transformers.file_utils import is_torch_available, is_vision_available
+if is_torch_available():
+    import numpy as np
+    import torch
+if is_vision_available():
+    from PIL import Image
+def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
+    """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+    or a list of PyTorch tensors if one specifies torchify=True.
+    """
+    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
+    if equal_resolution:
+        image_inputs = []
+        for i in range(feature_extract_tester.batch_size):
+            image_inputs.append(
+                np.random.randint(
+                    255,
+                    size=(
+                        feature_extract_tester.num_channels,
+                        feature_extract_tester.max_resolution,
+                        feature_extract_tester.max_resolution,
+                    ),
+                    dtype=np.uint8,
+                )
+            )
+    else:
+        image_inputs = []
+        for i in range(feature_extract_tester.batch_size):
+            width, height = np.random.choice(
+                np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2
+            )
+            image_inputs.append(
+                np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)
+            )
+    if not numpify and not torchify:
+        # PIL expects the channel dimension as last dimension
+        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
+    if torchify:
+        image_inputs = [torch.from_numpy(x) for x in image_inputs]
+    return image_inputs
 class FeatureExtractionSavingTestMixin:
    def test_feat_extract_to_json_string(self):

--- a/tests/test_feature_extraction_deit.py
+++ b/tests/test_feature_extraction_deit.py
@@ -21,7 +21,7 @@ import numpy as np
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
-from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
+from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 if is_torch_available():
@@ -75,36 +75,6 @@ class DeiTFeatureExtractionTester(unittest.TestCase):
            "image_std": self.image_std,
        }
-    def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-        if equal_resolution:
-            image_inputs = []
-            for i in range(self.batch_size):
-                image_inputs.append(
-                    np.random.randint(
-                        255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
-                    )
-                )
-        else:
-            image_inputs = []
-            for i in range(self.batch_size):
-                width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2)
-                image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8))
-        if not numpify and not torchify:
-            # PIL expects the channel dimension as last dimension
-            image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-        if torchify:
-            image_inputs = [torch.from_numpy(x) for x in image_inputs]
-        return image_inputs
 @require_torch
 @require_vision
@@ -136,7 +106,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PIL images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)
@@ -168,7 +138,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random numpy tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)
@@ -200,7 +170,7 @@ class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestC
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PyTorch tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

--- a/tests/test_feature_extraction_detr.py
+++ b/tests/test_feature_extraction_detr.py
--- a/tests/test_feature_extraction_vit.py
+++ b/tests/test_feature_extraction_vit.py
@@ -21,7 +21,7 @@ import numpy as np
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
-from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
+from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 if is_torch_available():
@@ -69,36 +69,6 @@ class ViTFeatureExtractionTester(unittest.TestCase):
            "size": self.size,
        }
-    def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-        if equal_resolution:
-            image_inputs = []
-            for i in range(self.batch_size):
-                image_inputs.append(
-                    np.random.randint(
-                        255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
-                    )
-                )
-        else:
-            image_inputs = []
-            for i in range(self.batch_size):
-                width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2)
-                image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8))
-        if not numpify and not torchify:
-            # PIL expects the channel dimension as last dimension
-            image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-        if torchify:
-            image_inputs = [torch.from_numpy(x) for x in image_inputs]
-        return image_inputs
 @require_torch
 @require_vision
@@ -128,7 +98,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PIL images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)
@@ -160,7 +130,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random numpy tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)
@@ -192,7 +162,7 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PyTorch tensors
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True)
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -21,7 +21,7 @@ import random
 import tempfile
 import unittest
 import warnings
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 from huggingface_hub import HfApi
 from requests.exceptions import HTTPError
@@ -982,7 +982,6 @@ class ModelTesterMixin:
        outputs = model(**inputs)
-        print(outputs)
        output = outputs[0]
        if config.is_encoder_decoder:
@@ -1236,6 +1235,11 @@ class ModelTesterMixin:
                    if isinstance(tuple_object, (List, Tuple)):
                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif isinstance(tuple_object, Dict):
+                        for tuple_iterable_value, dict_iterable_value in zip(
+                            tuple_object.values(), dict_object.values()
+                        ):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif tuple_object is None:
                        return
                    else:

--- a/tests/test_modeling_deit.py
+++ b/tests/test_modeling_deit.py
@@ -360,7 +360,7 @@ class DeiTModelTest(ModelTesterMixin, unittest.TestCase):
 # We will verify our results on an image of cute cats
 def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png")
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    return image

--- a/tests/test_modeling_detr.py
+++ b/tests/test_modeling_detr.py
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -322,7 +322,7 @@ class ViTModelTest(ModelTesterMixin, unittest.TestCase):
 # We will verify our results on an image of cute cats
 def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/cats.png")
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    return image