Add Mask2Former (#20792)

* Adds Mask2Former to transformers Co-authored-by: Shivalika Singh <shivalikasingh95@gmail.com> Co-authored-by: Shivalika Singh <73357305+shivalikasingh95@users.noreply.github.com> Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Add Mask2Former (#20792)
* Adds Mask2Former to transformers Co-authored-by: Shivalika Singh <shivalikasingh95@gmail.com> Co-authored-by: Shivalika Singh <73357305+shivalikasingh95@users.noreply.github.com> Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2411f0e4 · Alara Dirik · GitHub · 9edf3758 · 2411f0e4 · 2411f0e4
Unverified Commit 2411f0e4 authored Jan 16, 2023 by Alara Dirik Committed by GitHub Jan 16, 2023
8 changed files
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -103,6 +103,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("m2m_100", "M2M100Model"),
        ("marian", "MarianModel"),
        ("markuplm", "MarkupLMModel"),
+        ("mask2former", "Mask2FormerModel"),
        ("maskformer", "MaskFormerModel"),
        ("maskformer-swin", "MaskFormerSwinModel"),
        ("mbart", "MBartModel"),
@@ -454,6 +455,7 @@ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
    [
        # Model for Universal Segmentation mapping
        ("detr", "DetrForSegmentation"),
+        ("mask2former", "Mask2FormerForUniversalSegmentation"),
        ("maskformer", "MaskFormerForInstanceSegmentation"),
    ]
 )

--- a/src/transformers/models/mask2former/__init__.py
+++ b/src/transformers/models/mask2former/__init__.py
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_mask2former": [
+        "MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "Mask2FormerConfig",
+    ],
+}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_mask2former"] = [
+        "MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "Mask2FormerForUniversalSegmentation",
+        "Mask2FormerModel",
+        "Mask2FormerPreTrainedModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_mask2former import MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Mask2FormerConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_mask2former import (
+            MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            Mask2FormerForUniversalSegmentation,
+            Mask2FormerModel,
+            Mask2FormerPreTrainedModel,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/src/transformers/models/mask2former/configuration_mask2former.py
+++ b/src/transformers/models/mask2former/configuration_mask2former.py
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc.and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Mask2Former model configuration"""
+import copy
+from typing import Dict, List, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/mask2former-swin-small-coco-instance": (
+        "https://huggingface.co/facebook/mask2former-swin-small-coco-instance/blob/main/config.json"
+    )
+    # See all Mask2Former models at https://huggingface.co/models?filter=mask2former
+}
+
+logger = logging.get_logger(__name__)
+
+
+class Mask2FormerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Mask2FormerModel`]. It is used to instantiate a
+    Mask2Former model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Mask2Former
+    [facebook/mask2former-swin-small-coco-instance](https://huggingface.co/facebook/mask2former-swin-small-coco-instance)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Currently, Mask2Former only supports the [Swin Transformer](swin) as backbone.
+
+    Args:
+        backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `SwinConfig()`):
+            The configuration of the backbone model. If unset, the configuration corresponding to
+            `swin-base-patch4-window12-384` will be used.
+        feature_size (`int`, *optional*, defaults to 256):
+            The features (channels) of the resulting feature maps.
+        mask_feature_size (`int`, *optional*, defaults to 256):
+            The masks' features size, this value will also be used to specify the Feature Pyramid Network features'
+            size.
+        hidden_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the encoder layers.
+        encoder_feedforward_dim (`int`, *optional*, defaults to 1024):
+            Dimension of feedforward network for deformable detr encoder used as part of pixel decoder.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of layers in the deformable detr encoder used as part of pixel decoder.
+        decoder_layers (`int`, *optional*, defaults to 10):
+            Number of layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder.
+        dim_feedforward (`int`, *optional*, defaults to 2048):
+            Feature dimension in feedforward network for transformer decoder.
+        pre_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use pre-LayerNorm or not for transformer decoder.
+        enforce_input_projection (`bool`, *optional*, defaults to `False`):
+            Whether to add an input projection 1x1 convolution even if the input channels and hidden dim are identical
+            in the Transformer decoder.
+        common_stride (`int`, *optional*, defaults to 4):
+            Parameter used for determining number of FPN levels used as part of pixel decoder.
+        ignore_value (`int`, *optional*, defaults to 255):
+            Category id to be ignored during training.
+        num_queries (`int`, *optional*, defaults to 100):
+            Number of queries for the decoder.
+        no_object_weight (`int`, *optional*, defaults to 0.1):
+            The weight to apply to the null (no object) class.
+        class_weight (`int`, *optional*, defaults to 2.0):
+            The weight for the cross entropy loss.
+        mask_weight (`int`, *optional*, defaults to 5.0):
+            The weight for the mask loss.
+        dice_weight (`int`, *optional*, defaults to 5.0):
+            The weight for the dice loss.
+        train_num_points (`str` or `function`, *optional*, defaults to 12544):
+            Number of points used for sampling during loss calculation.
+        oversample_ratio (`float`, *optional*, defaults to 3.0):
+            Oversampling parameter used for calculating no. of sampled points
+        importance_sample_ratio (`float`, *optional*, defaults to 0.75):
+            Ratio of points that are sampled via importance sampling.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        init_xavier_std (`float``, *optional*, defaults to 1.0):
+            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+        use_auxiliary_loss (`boolean``, *optional*, defaults to `True`):
+            If `True` [`Mask2FormerForUniversalSegmentationOutput`] will contain the auxiliary losses computed using
+            the logits from each decoder's stage.
+        feature_strides (`List[int]`, *optional*, defaults to `[4, 8, 16, 32]`):
+            Feature strides corresponding to features generated from backbone network.
+        output_auxiliary_logits (`bool`, *optional*):
+            Should the model output its `auxiliary_logits` or not.
+
+    Examples:
+
+    ```python
+    >>> from transformers import Mask2FormerConfig, Mask2FormerModel
+
+    >>> # Initializing a Mask2Former facebook/mask2former-swin-small-coco-instance configuration
+    >>> configuration = Mask2FormerConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/mask2former-swin-small-coco-instance style configuration
+    >>> model = Mask2FormerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+
+    """
+    model_type = "mask2former"
+    backbones_supported = ["swin"]
+    attribute_map = {"hidden_size": "hidden_dim"}
+
+    def __init__(
+        self,
+        backbone_config: Optional[Dict] = None,
+        feature_size: int = 256,
+        mask_feature_size: int = 256,
+        hidden_dim: int = 256,
+        encoder_feedforward_dim: int = 1024,
+        activation_function: str = "relu",
+        encoder_layers: int = 6,
+        decoder_layers: int = 10,
+        num_attention_heads: int = 8,
+        dropout: float = 0.0,
+        dim_feedforward: int = 2048,
+        pre_norm: bool = False,
+        enforce_input_projection: bool = False,
+        common_stride: int = 4,
+        ignore_value: int = 255,
+        num_queries: int = 100,
+        no_object_weight: float = 0.1,
+        class_weight: float = 2.0,
+        mask_weight: float = 5.0,
+        dice_weight: float = 5.0,
+        train_num_points: int = 12544,
+        oversample_ratio: float = 3.0,
+        importance_sample_ratio: float = 0.75,
+        init_std: float = 0.02,
+        init_xavier_std: float = 1.0,
+        use_auxiliary_loss: bool = True,
+        feature_strides: List[int] = [4, 8, 16, 32],
+        output_auxiliary_logits: bool = None,
+        **kwargs,
+    ):
+        if backbone_config is None:
+            logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
+            backbone_config = CONFIG_MAPPING["swin"](
+                image_size=224,
+                in_channels=3,
+                patch_size=4,
+                embed_dim=96,
+                depths=[2, 2, 18, 2],
+                num_heads=[3, 6, 12, 24],
+                window_size=7,
+                drop_path_rate=0.3,
+                use_absolute_embeddings=False,
+                out_features=["stage1", "stage2", "stage3", "stage4"],
+            )
+        elif isinstance(backbone_config, dict):
+            backbone_model_type = backbone_config.get("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        self.backbone_config = backbone_config
+        self.feature_size = feature_size
+        self.mask_feature_size = mask_feature_size
+        self.hidden_dim = hidden_dim
+        self.encoder_feedforward_dim = encoder_feedforward_dim
+        self.activation_function = activation_function
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.num_attention_heads = num_attention_heads
+        self.dropout = dropout
+        self.dim_feedforward = dim_feedforward
+        self.pre_norm = pre_norm
+        self.enforce_input_projection = enforce_input_projection
+        self.common_stride = common_stride
+        self.ignore_value = ignore_value
+        self.num_queries = num_queries
+        self.no_object_weight = no_object_weight
+        self.class_weight = class_weight
+        self.mask_weight = mask_weight
+        self.dice_weight = dice_weight
+        self.train_num_points = train_num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.init_std = init_std
+        self.init_xavier_std = init_xavier_std
+        self.use_auxiliary_loss = use_auxiliary_loss
+        self.feature_strides = feature_strides
+        self.output_auxiliary_logits = output_auxiliary_logits
+        self.num_hidden_layers = decoder_layers
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`Mask2FormerConfig`] (or a derived class) from a pre-trained backbone model configuration.
+
+        Args:
+            backbone_config ([`PretrainedConfig`]):
+                The backbone configuration.
+
+        Returns:
+            [`Mask2FormerConfig`]: An instance of a configuration object
+        """
+        return cls(
+            backbone_config=backbone_config,
+            **kwargs,
+        )
+
+    def to_dict(self) -> Dict[str, any]:
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
--- a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py
--- a/src/transformers/models/mask2former/modeling_mask2former.py
+++ b/src/transformers/models/mask2former/modeling_mask2former.py
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3568,6 +3568,30 @@ class MarkupLMPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])


+MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class Mask2FormerForUniversalSegmentation(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Mask2FormerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Mask2FormerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None



--- a/tests/models/mask2former/__init__.py
+++ b/tests/models/mask2former/__init__.py
--- a/tests/models/mask2former/test_modeling_mask2former.py
+++ b/tests/models/mask2former/test_modeling_mask2former.py
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Mask2Former model. """
+
+import inspect
+import unittest
+
+import numpy as np
+
+from tests.test_modeling_common import floats_tensor
+from transformers import Mask2FormerConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device
+from transformers.utils import cached_property
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import Mask2FormerForUniversalSegmentation, Mask2FormerModel
+
+    if is_vision_available():
+        from transformers import MaskFormerImageProcessor
+
+if is_vision_available():
+    from PIL import Image
+
+
+class Mask2FormerModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        is_training=True,
+        use_auxiliary_loss=False,
+        num_queries=10,
+        num_channels=3,
+        min_size=32 * 8,
+        max_size=32 * 8,
+        num_labels=4,
+        hidden_dim=64,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.is_training = is_training
+        self.use_auxiliary_loss = use_auxiliary_loss
+        self.num_queries = num_queries
+        self.num_channels = num_channels
+        self.min_size = min_size
+        self.max_size = max_size
+        self.num_labels = num_labels
+        self.hidden_dim = hidden_dim
+        self.mask_feature_size = hidden_dim
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]).to(
+            torch_device
+        )
+
+        pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device)
+
+        mask_labels = (
+            torch.rand([self.batch_size, self.num_labels, self.min_size, self.max_size], device=torch_device) > 0.5
+        ).float()
+        class_labels = (torch.rand((self.batch_size, self.num_labels), device=torch_device) > 0.5).long()
+
+        config = self.get_config()
+        return config, pixel_values, pixel_mask, mask_labels, class_labels
+
+    def get_config(self):
+        config = Mask2FormerConfig(
+            hidden_size=self.hidden_dim,
+        )
+        config.num_queries = self.num_queries
+        config.num_labels = self.num_labels
+
+        config.backbone_config.depths = [1, 1, 1, 1]
+        config.backbone_config.num_channels = self.num_channels
+
+        config.encoder_feedforward_dim = 64
+        config.dim_feedforward = 128
+        config.hidden_dim = self.hidden_dim
+        config.mask_feature_size = self.hidden_dim
+        config.feature_size = self.hidden_dim
+        return config
+
+    def prepare_config_and_inputs_for_common(self):
+        config, pixel_values, pixel_mask, _, _ = self.prepare_config_and_inputs()
+        inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
+        return config, inputs_dict
+
+    def check_output_hidden_state(self, output, config):
+        encoder_hidden_states = output.encoder_hidden_states
+        pixel_decoder_hidden_states = output.pixel_decoder_hidden_states
+        transformer_decoder_hidden_states = output.transformer_decoder_hidden_states
+
+        self.parent.assertTrue(len(encoder_hidden_states), len(config.backbone_config.depths))
+        self.parent.assertTrue(len(pixel_decoder_hidden_states), len(config.backbone_config.depths))
+        self.parent.assertTrue(len(transformer_decoder_hidden_states), config.decoder_layers)
+
+    def create_and_check_mask2former_model(self, config, pixel_values, pixel_mask, output_hidden_states=False):
+        with torch.no_grad():
+            model = Mask2FormerModel(config=config)
+            model.to(torch_device)
+            model.eval()
+
+            output = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
+            output = model(pixel_values, output_hidden_states=True)
+
+        self.parent.assertEqual(
+            output.transformer_decoder_last_hidden_state.shape,
+            (self.batch_size, self.num_queries, self.hidden_dim),
+        )
+        # let's ensure the other two hidden state exists
+        self.parent.assertTrue(output.pixel_decoder_last_hidden_state is not None)
+        self.parent.assertTrue(output.encoder_last_hidden_state is not None)
+
+        if output_hidden_states:
+            self.check_output_hidden_state(output, config)
+
+    def create_and_check_mask2former_instance_segmentation_head_model(
+        self, config, pixel_values, pixel_mask, mask_labels, class_labels
+    ):
+        model = Mask2FormerForUniversalSegmentation(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        def comm_check_on_output(result):
+            # let's still check that all the required stuff is there
+            self.parent.assertTrue(result.transformer_decoder_last_hidden_state is not None)
+            self.parent.assertTrue(result.pixel_decoder_last_hidden_state is not None)
+            self.parent.assertTrue(result.encoder_last_hidden_state is not None)
+            # okay, now we need to check the logits shape
+            # due to the encoder compression, masks have a //4 spatial size
+            self.parent.assertEqual(
+                result.masks_queries_logits.shape,
+                (self.batch_size, self.num_queries, self.min_size // 4, self.max_size // 4),
+            )
+            # + 1 for null class
+            self.parent.assertEqual(
+                result.class_queries_logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)
+            )
+
+        with torch.no_grad():
+            result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
+            result = model(pixel_values)
+
+            comm_check_on_output(result)
+
+            result = model(
+                pixel_values=pixel_values, pixel_mask=pixel_mask, mask_labels=mask_labels, class_labels=class_labels
+            )
+
+        comm_check_on_output(result)
+
+        self.parent.assertTrue(result.loss is not None)
+        self.parent.assertEqual(result.loss.shape, torch.Size([1]))
+
+
+@require_torch
+class Mask2FormerModelTest(ModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (Mask2FormerModel, Mask2FormerForUniversalSegmentation) if is_torch_available() else ()
+
+    is_encoder_decoder = False
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = False
+
+    def setUp(self):
+        self.model_tester = Mask2FormerModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=Mask2FormerConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_mask2former_model(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.create_and_check_mask2former_model(config, **inputs, output_hidden_states=False)
+
+    def test_mask2former_instance_segmentation_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_mask2former_instance_segmentation_head_model(*config_and_inputs)
+
+    @unittest.skip(reason="Mask2Former does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Mask2Former does not have a get_input_embeddings method")
+    def test_model_common_attributes(self):
+        pass
+
+    @unittest.skip(reason="Mask2Former is not a generative model")
+    def test_generate_without_input_ids(self):
+        pass
+
+    @unittest.skip(reason="Mask2Former does not use token embeddings")
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Mask2Former has some layers using `add_module` which doesn't work well with `nn.DataParallel`"
+    )
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in ["facebook/mask2former-swin-small-coco-instance"]:
+            model = Mask2FormerModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    def test_model_with_labels(self):
+        size = (self.model_tester.min_size,) * 2
+        inputs = {
+            "pixel_values": torch.randn((2, 3, *size), device=torch_device),
+            "mask_labels": torch.randn((2, 10, *size), device=torch_device),
+            "class_labels": torch.zeros(2, 10, device=torch_device).long(),
+        }
+        config = self.model_tester.get_config()
+
+        model = Mask2FormerForUniversalSegmentation(config).to(torch_device)
+        outputs = model(**inputs)
+        self.assertTrue(outputs.loss is not None)
+
+    def test_hidden_states_output(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.create_and_check_mask2former_model(config, **inputs, output_hidden_states=True)
+
+    def test_attention_outputs(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            outputs = model(**inputs, output_attentions=True)
+            self.assertTrue(outputs.attentions is not None)
+
+    def test_training(self):
+        if not self.model_tester.is_training:
+            return
+
+        model_class = self.all_model_classes[1]
+        config, pixel_values, pixel_mask, mask_labels, class_labels = self.model_tester.prepare_config_and_inputs()
+
+        model = model_class(config)
+        model.to(torch_device)
+        model.train()
+
+        loss = model(pixel_values, mask_labels=mask_labels, class_labels=class_labels).loss
+        loss.backward()
+
+    def test_retain_grad_hidden_states_attentions(self):
+        model_class = self.all_model_classes[1]
+        config, pixel_values, pixel_mask, mask_labels, class_labels = self.model_tester.prepare_config_and_inputs()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        model = model_class(config).to(torch_device)
+        model.train()
+
+        outputs = model(pixel_values, mask_labels=mask_labels, class_labels=class_labels)
+
+        encoder_hidden_states = outputs.encoder_hidden_states[0]
+        encoder_hidden_states.retain_grad()
+
+        pixel_decoder_hidden_states = outputs.pixel_decoder_hidden_states[0]
+        pixel_decoder_hidden_states.retain_grad()
+
+        transformer_decoder_hidden_states = outputs.transformer_decoder_hidden_states[0]
+        transformer_decoder_hidden_states.retain_grad()
+
+        attentions = outputs.attentions[0]
+        attentions.retain_grad()
+
+        outputs.loss.backward(retain_graph=True)
+
+        self.assertIsNotNone(encoder_hidden_states.grad)
+        self.assertIsNotNone(pixel_decoder_hidden_states.grad)
+        self.assertIsNotNone(transformer_decoder_hidden_states.grad)
+        self.assertIsNotNone(attentions.grad)
+
+
+TOLERANCE = 1e-4
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_vision
+@slow
+class Mask2FormerModelIntegrationTest(unittest.TestCase):
+    @cached_property
+    def model_checkpoints(self):
+        return "facebook/mask2former-swin-small-coco-instance"
+
+    @cached_property
+    def default_feature_extractor(self):
+        return MaskFormerImageProcessor.from_pretrained(self.model_checkpoints) if is_vision_available() else None
+
+    def test_inference_no_head(self):
+        model = Mask2FormerModel.from_pretrained(self.model_checkpoints).to(torch_device)
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
+        inputs_shape = inputs["pixel_values"].shape
+        # check size is divisible by 32
+        self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
+        # check size
+        self.assertEqual(inputs_shape, (1, 3, 384, 384))
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        expected_slice_hidden_state = torch.tensor(
+            [[-0.2790, -1.0717, -1.1668], [-0.5128, -0.3128, -0.4987], [-0.5832, 0.1971, -0.0197]]
+        ).to(torch_device)
+        self.assertTrue(
+            torch.allclose(
+                outputs.encoder_last_hidden_state[0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE
+            )
+        )
+
+        expected_slice_hidden_state = torch.tensor(
+            [[0.8973, 1.1847, 1.1776], [1.1934, 1.5040, 1.5128], [1.1153, 1.4486, 1.4951]]
+        ).to(torch_device)
+        self.assertTrue(
+            torch.allclose(
+                outputs.pixel_decoder_last_hidden_state[0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE
+            )
+        )
+
+        expected_slice_hidden_state = torch.tensor(
+            [[2.1152, 1.7000, -0.8603], [1.5808, 1.8004, -0.9353], [1.6043, 1.7495, -0.5999]]
+        ).to(torch_device)
+        self.assertTrue(
+            torch.allclose(
+                outputs.transformer_decoder_last_hidden_state[0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE
+            )
+        )
+
+    def test_inference_universal_segmentation_head(self):
+        model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).to(torch_device).eval()
+        feature_extractor = self.default_feature_extractor
+        image = prepare_img()
+        inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
+        inputs_shape = inputs["pixel_values"].shape
+        # check size is divisible by 32
+        self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0)
+        # check size
+        self.assertEqual(inputs_shape, (1, 3, 384, 384))
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # masks_queries_logits
+        masks_queries_logits = outputs.masks_queries_logits
+        self.assertEqual(
+            masks_queries_logits.shape, (1, model.config.num_queries, inputs_shape[-2] // 4, inputs_shape[-1] // 4)
+        )
+        expected_slice = [
+            [-8.7839, -9.0056, -8.8121],
+            [-7.4104, -7.0313, -6.5401],
+            [-6.6105, -6.3427, -6.4675],
+        ]
+        expected_slice = torch.tensor(expected_slice).to(torch_device)
+        self.assertTrue(torch.allclose(masks_queries_logits[0, 0, :3, :3], expected_slice, atol=TOLERANCE))
+        # class_queries_logits
+        class_queries_logits = outputs.class_queries_logits
+        self.assertEqual(class_queries_logits.shape, (1, model.config.num_queries, model.config.num_labels + 1))
+        expected_slice = torch.tensor(
+            [
+                [1.8324, -8.0835, -4.1922],
+                [0.8450, -9.0050, -3.6053],
+                [0.3045, -7.7293, -3.0275],
+            ]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.class_queries_logits[0, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_with_segmentation_maps_and_loss(self):
+        model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).to(torch_device).eval()
+        feature_extractor = self.default_feature_extractor
+
+        inputs = feature_extractor(
+            [np.zeros((3, 800, 1333)), np.zeros((3, 800, 1333))],
+            segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)],
+            return_tensors="pt",
+        )
+
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch_device)
+        inputs["mask_labels"] = [el.to(torch_device) for el in inputs["mask_labels"]]
+        inputs["class_labels"] = [el.to(torch_device) for el in inputs["class_labels"]]
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        self.assertTrue(outputs.loss is not None)