Deprecate low use models (#30781)

* Deprecate models
  - graphormer
  - time_series_transformer
  - xlm_prophetnet
  - qdqbert
  - nat
  - ernie_m
  - tvlt
  - nezha
  - mega
  - jukebox
  - vit_hybrid
  - x_clip
  - deta
  - speech_to_text_2
  - efficientformer
  - realm
  - gptsan_japanese
* Fix up
* Fix speech2text2 imports
* Make sure message isn't indented
* Fix docstrings
* Correctly map for deprecated models from model_type
* Uncomment out
* Add back time series transformer and x-clip
* Import fix and fix-up
* Fix up with updated ruff

Deprecate low use models (#30781)
* Deprecate models
  - graphormer
  - time_series_transformer
  - xlm_prophetnet
  - qdqbert
  - nat
  - ernie_m
  - tvlt
  - nezha
  - mega
  - jukebox
  - vit_hybrid
  - x_clip
  - deta
  - speech_to_text_2
  - efficientformer
  - realm
  - gptsan_japanese
* Fix up
* Fix speech2text2 imports
* Make sure message isn't indented
* Fix docstrings
* Correctly map for deprecated models from model_type
* Uncomment out
* Add back time series transformer and x-clip
* Import fix and fix-up
* Fix up with updated ruff
a564d10a · amyeroberts · GitHub · 7f08817b · 7f08817b · 7f08817b
Unverified commit a564d10a, authored May 28, 2024 by amyeroberts; committed by GitHub May 28, 2024.
20 changed files
--- a/tests/models/efficientformer/test_modeling_efficientformer.py
+++ b/tests/models/efficientformer/test_modeling_efficientformer.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch EfficientFormer model."""
-
-import unittest
-import warnings
-from typing import List
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        EfficientFormerForImageClassification,
-        EfficientFormerForImageClassificationWithTeacher,
-        EfficientFormerModel,
-    )
-    from transformers.models.auto.modeling_auto import (
-        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
-        MODEL_MAPPING_NAMES,
-    )
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import EfficientFormerImageProcessor
-
-
-class EfficientFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size: int = 13,
-        image_size: int = 64,
-        patch_size: int = 2,
-        embed_dim: int = 3,
-        num_channels: int = 3,
-        is_training: bool = True,
-        use_labels: bool = True,
-        hidden_size: int = 128,
-        hidden_sizes=[16, 32, 64, 128],
-        num_hidden_layers: int = 7,
-        num_attention_heads: int = 4,
-        intermediate_size: int = 37,
-        hidden_act: str = "gelu",
-        hidden_dropout_prob: float = 0.1,
-        attention_probs_dropout_prob: float = 0.1,
-        type_sequence_label_size: int = 10,
-        initializer_range: float = 0.02,
-        encoder_stride: int = 2,
-        num_attention_outputs: int = 1,
-        dim: int = 128,
-        depths: List[int] = [2, 2, 2, 2],
-        resolution: int = 2,
-        mlp_expansion_ratio: int = 2,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.encoder_stride = encoder_stride
-        self.num_attention_outputs = num_attention_outputs
-        self.embed_dim = embed_dim
-        self.seq_length = embed_dim + 1
-        self.resolution = resolution
-        self.depths = depths
-        self.hidden_sizes = hidden_sizes
-        self.dim = dim
-        self.mlp_expansion_ratio = mlp_expansion_ratio
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return EfficientFormerConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-            resolution=self.resolution,
-            depths=self.depths,
-            hidden_sizes=self.hidden_sizes,
-            dim=self.dim,
-            mlp_expansion_ratio=self.mlp_expansion_ratio,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = EfficientFormerModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = EfficientFormerForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = EfficientFormerForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            pixel_values,
-            labels,
-        ) = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_torch
-class EfficientFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_common.py, as EfficientFormer does not use input_ids, inputs_embeds,
-    attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (
-            EfficientFormerModel,
-            EfficientFormerForImageClassificationWithTeacher,
-            EfficientFormerForImageClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "image-feature-extraction": EfficientFormerModel,
-            "image-classification": (
-                EfficientFormerForImageClassification,
-                EfficientFormerForImageClassificationWithTeacher,
-            ),
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = EfficientFormerModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
-                    seq_length = seq_length * self.model_tester.chunk_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[-1].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[-1].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher":
-                del inputs_dict["labels"]
-
-        return inputs_dict
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
-    def test_for_masked_image_modeling(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    # special case for EfficientFormerForImageClassificationWithTeacher model
-    def test_training(self):
-        if not self.model_tester.is_training:
-            return
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        for model_class in self.all_model_classes:
-            # EfficientFormerForImageClassificationWithTeacher supports inference-only
-            if (
-                model_class.__name__ in MODEL_MAPPING_NAMES.values()
-                or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
-            ):
-                continue
-            model = model_class(config)
-            model.to(torch_device)
-            model.train()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-            loss = model(**inputs).loss
-            loss.backward()
-
-    def test_problem_types(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        problem_types = [
-            {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
-            {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
-            {"title": "regression", "num_labels": 1, "dtype": torch.float},
-        ]
-
-        for model_class in self.all_model_classes:
-            if (
-                model_class.__name__
-                not in [
-                    *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(),
-                ]
-                or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
-            ):
-                continue
-
-            for problem_type in problem_types:
-                with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
-                    config.problem_type = problem_type["title"]
-                    config.num_labels = problem_type["num_labels"]
-
-                    model = model_class(config)
-                    model.to(torch_device)
-                    model.train()
-
-                    inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
-                    if problem_type["num_labels"] > 1:
-                        inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
-
-                    inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
-
-                    # This tests that we do not trigger the warning form PyTorch "Using a target size that is different
-                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
-                    # they have the same size." which is a symptom something in wrong for the regression problem.
-                    # See https://github.com/huggingface/transformers/issues/11780
-                    with warnings.catch_warnings(record=True) as warning_list:
-                        loss = model(**inputs).loss
-                    for w in warning_list:
-                        if "Using a target size that is different to the input size" in str(w.message):
-                            raise ValueError(
-                                f"Something is going wrong in the regression problem: intercepted {w.message}"
-                            )
-
-                    loss.backward()
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "snap-research/efficientformer-l1-300"
-        model = EfficientFormerModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_len = getattr(self.model_tester, "seq_length", None)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(self.model_tester, "chunk_length", None)
-
-        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            if chunk_length is not None:
-                self.assertListEqual(
-                    list(attentions[0].shape[-4:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
-                )
-            else:
-                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-                )
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-    return image
-
-
-@require_torch
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return (
-            EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
-            if is_vision_available()
-            else None
-        )
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300").to(
-            torch_device
-        )
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([-0.0555, 0.4825, -0.0852]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_image_classification_head_with_teacher(self):
-        model = EfficientFormerForImageClassificationWithTeacher.from_pretrained(
-            "snap-research/efficientformer-l1-300"
-        ).to(torch_device)
-
-        image_processor = self.default_image_processor
-        image = prepare_img()
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = (1, 1000)
-        self.assertEqual(outputs.logits.shape, expected_shape)
-
-        expected_slice = torch.tensor([-0.1312, 0.4353, -1.0499]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
--- a/tests/models/efficientformer/test_modeling_tf_efficientformer.py
+++ b/tests/models/efficientformer/test_modeling_tf_efficientformer.py
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow EfficientFormer model."""
-
-import inspect
-import unittest
-from typing import List
-
-import numpy as np
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TFEfficientFormerForImageClassification,
-        TFEfficientFormerForImageClassificationWithTeacher,
-        TFEfficientFormerModel,
-    )
-    from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import EfficientFormerImageProcessor
-
-
-class TFEfficientFormerModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size: int = 13,
-        image_size: int = 64,
-        patch_size: int = 2,
-        embed_dim: int = 3,
-        num_channels: int = 3,
-        is_training: bool = True,
-        use_labels: bool = True,
-        hidden_size: int = 128,
-        hidden_sizes=[16, 32, 64, 128],
-        num_hidden_layers: int = 7,
-        num_attention_heads: int = 4,
-        intermediate_size: int = 37,
-        hidden_act: str = "gelu",
-        hidden_dropout_prob: float = 0.1,
-        attention_probs_dropout_prob: float = 0.1,
-        type_sequence_label_size: int = 10,
-        initializer_range: float = 0.02,
-        encoder_stride: int = 2,
-        num_attention_outputs: int = 1,
-        dim: int = 128,
-        depths: List[int] = [2, 2, 2, 2],
-        resolution: int = 2,
-        mlp_expansion_ratio: int = 2,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.encoder_stride = encoder_stride
-        self.num_attention_outputs = num_attention_outputs
-        self.embed_dim = embed_dim
-        self.seq_length = embed_dim + 1
-        self.resolution = resolution
-        self.depths = depths
-        self.hidden_sizes = hidden_sizes
-        self.dim = dim
-        self.mlp_expansion_ratio = mlp_expansion_ratio
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return EfficientFormerConfig(
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-            encoder_stride=self.encoder_stride,
-            resolution=self.resolution,
-            depths=self.depths,
-            hidden_sizes=self.hidden_sizes,
-            dim=self.dim,
-            mlp_expansion_ratio=self.mlp_expansion_ratio,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = TFEfficientFormerModel(config=config)
-        result = model(pixel_values, training=False)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        config.num_labels = self.type_sequence_label_size
-        model = TFEfficientFormerForImageClassification(config)
-        result = model(pixel_values, labels=labels, training=False)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = TFEfficientFormerForImageClassification(config)
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_tf
-class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """
-    Here we also overwrite some of the tests of test_modeling_tf_common.py, as EfficientFormer does not use input_ids,
-    inputs_embeds, attention_mask and seq_length.
-    """
-
-    all_model_classes = (
-        (
-            TFEfficientFormerModel,
-            TFEfficientFormerForImageClassificationWithTeacher,
-            TFEfficientFormerForImageClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFEfficientFormerModel,
-            "image-classification": (
-                TFEfficientFormerForImageClassification,
-                TFEfficientFormerForImageClassificationWithTeacher,
-            ),
-        }
-        if is_tf_available()
-        else {}
-    )
-
-    fx_compatible = False
-
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TFEfficientFormerModelTester(self)
-        self.config_tester = ConfigTester(
-            self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
-        )
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.call)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            if hasattr(self.model_tester, "encoder_seq_length"):
-                seq_length = self.model_tester.encoder_seq_length
-                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
-                    seq_length = seq_length * self.model_tester.chunk_length
-            else:
-                seq_length = self.model_tester.seq_length
-
-            self.assertListEqual(
-                list(hidden_states[-1].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size],
-            )
-
-            if config.is_encoder_decoder:
-                hidden_states = outputs.decoder_hidden_states
-
-                self.asseretIsInstance(hidden_states, (list, tuple))
-                self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
-                self.assertListEqual(
-                    list(hidden_states[-1].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size],
-                )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        """Prepare inputs through the parent implementation, dropping labels for the teacher head.
-
-        When ``return_labels`` is set and ``model_class`` is
-        ``TFEfficientFormerForImageClassificationWithTeacher``, the ``"labels"``
-        entry is removed from the prepared inputs before they are returned.
-        """
-        prepared = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels and model_class.__name__ == "TFEfficientFormerForImageClassificationWithTeacher":
-            del prepared["labels"]
-
-        return prepared
-
-    def test_model(self):
-        """Run the model tester's base-model check on freshly prepared config and inputs."""
-        tester = self.model_tester
-        tester.create_and_check_model(*tester.prepare_config_and_inputs())
-
-    @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
-    def test_for_masked_image_modeling(self):
-        """Masked-image-modeling check; always skipped through the decorator above."""
-        tester = self.model_tester
-        tester.create_and_check_for_masked_image_modeling(*tester.prepare_config_and_inputs())
-
-    def test_for_image_classification(self):
-        """Run the model tester's image-classification check on freshly prepared config and inputs."""
-        tester = self.model_tester
-        tester.create_and_check_for_image_classification(*tester.prepare_config_and_inputs())
-
-    @slow
-    def test_model_from_pretrained(self):
-        """Load the pretrained checkpoint and check that a model object is returned."""
-        model = TFEfficientFormerModel.from_pretrained("snap-research/efficientformer-l1-300")
-        self.assertIsNotNone(model)
-
-    def test_attention_outputs(self):
-        """Check the count and trailing shape of the attention outputs for every model class.
-
-        Attentions are requested first through the call inputs and then through
-        the config; both runs must yield ``num_attention_outputs`` entries.
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        tester = self.model_tester
-        fallback_seq_len = getattr(tester, "seq_length", None)
-        encoder_seq_length = getattr(tester, "encoder_seq_length", fallback_seq_len)
-        encoder_key_length = getattr(tester, "key_length", encoder_seq_length)
-        chunk_length = getattr(tester, "chunk_length", None)
-
-        if chunk_length is not None and hasattr(tester, "num_hashes"):
-            encoder_seq_length = encoder_seq_length * tester.num_hashes
-
-        def collect_attentions(model_class):
-            # Build a fresh model from the current config and return its (encoder) attentions.
-            model = model_class(config)
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-            if config.is_encoder_decoder:
-                return outputs.encoder_attentions
-            return outputs.attentions
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-
-            attentions = collect_attentions(model_class)
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-
-            attentions = collect_attentions(model_class)
-            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
-            # Chunked attention carries one extra trailing dimension for the chunk length.
-            if chunk_length is None:
-                observed = list(attentions[0].shape[-3:])
-                expected = [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length]
-            else:
-                observed = list(attentions[0].shape[-4:])
-                expected = [
-                    self.model_tester.num_attention_heads,
-                    encoder_seq_length,
-                    chunk_length,
-                    encoder_key_length,
-                ]
-            self.assertListEqual(observed, expected)
-
-    def test_compile_tf_model(self):
-        """Call every model class on symbolic Keras inputs and check that it returns something.
-
-        The symbolic inputs are built from the entries of ``model.input_signature``
-        whose key also appears in ``model.dummy_inputs``.
-        """
-        # We use a simplified version of this test for EfficientFormer because it requires training=False
-        # and Keras refuses to let us force that during functional construction
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Prepare our model
-            model = model_class(config)
-            # These are maximally general inputs for the model, with multiple None dimensions
-            # Hopefully this will catch any conditionals that fail for flexible shapes
-            functional_inputs = {
-                key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key)
-                for key, val in model.input_signature.items()
-                if key in model.dummy_inputs
-            }
-            outputs_dict = model(functional_inputs)
-            # assertIsNotNone reports the offending value on failure, unlike assertTrue(... is not None)
-            self.assertIsNotNone(outputs_dict)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
-    """Open and return the COCO fixture image used by the integration tests."""
-    return Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-
-
-@require_tf
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
-    """Slow integration checks of the TF EfficientFormer classification heads against reference logits."""
-
-    @cached_property
-    def default_image_processor(self):
-        """Return the pretrained image processor, or ``None`` when vision support is unavailable."""
-        if not is_vision_available():
-            return None
-        return EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
-
-    @slow
-    def test_inference_image_classification_head(self):
-        """Compare the first three logits of the classification head with stored reference values."""
-        classifier = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
-        processor = self.default_image_processor
-        inputs = processor(images=prepare_img(), return_tensors="tf")
-
-        # forward pass
-        outputs = classifier(**inputs, training=False)
-
-        # verify the logits
-        self.assertEqual(outputs.logits.shape, tf.TensorShape((1, 1000)))
-        reference = tf.constant([-0.0555, 0.4825, -0.0852])
-        logits_match = np.allclose(outputs.logits[0, :3], reference, atol=1e-4)
-        self.assertTrue(logits_match)
-
-    @slow
-    def test_inference_image_classification_head_with_teacher(self):
-        """Compare the first three logits of the with-teacher head with stored reference values."""
-        classifier = TFEfficientFormerForImageClassificationWithTeacher.from_pretrained(
-            "snap-research/efficientformer-l1-300"
-        )
-        processor = self.default_image_processor
-        inputs = processor(images=prepare_img(), return_tensors="tf")
-
-        # forward pass
-        outputs = classifier(**inputs, training=False)
-
-        # verify the logits
-        self.assertEqual(outputs.logits.shape, tf.TensorShape((1, 1000)))
-        reference = tf.constant([-0.1312, 0.4353, -1.0499])
-        logits_match = np.allclose(outputs.logits[0, :3], reference, atol=1e-4)
-        self.assertTrue(logits_match)
--- a/tests/models/ernie_m/__init__.py
+++ b/tests/models/ernie_m/__init__.py
--- a/tests/models/ernie_m/test_modeling_ernie_m.py
+++ b/tests/models/ernie_m/test_modeling_ernie_m.py
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ErnieM model."""
-
-import unittest
-
-from transformers import ErnieMConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        ErnieMForInformationExtraction,
-        ErnieMForMultipleChoice,
-        ErnieMForQuestionAnswering,
-        ErnieMForSequenceClassification,
-        ErnieMForTokenClassification,
-        ErnieMModel,
-    )
-
-
-class ErnieMModelTester:
-    """Build a small ``ErnieMConfig`` with random inputs and run output-shape checks for each ErnieM head.
-
-    ``parent`` is the object whose ``assertEqual`` is used for the checks; the
-    remaining constructor arguments are stored as attributes and drive the
-    config and the sizes of the generated tensors.
-    """
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        """Return ``(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels)``.
-
-        The mask is ``None`` unless ``use_input_mask`` is set, and all three
-        label tensors are ``None`` unless ``use_labels`` is set.
-        """
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_uiem(self):
-        """Return ``(config, input_ids, input_mask)`` without any label tensors."""
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-        config = self.get_config()
-
-        return config, input_ids, input_mask
-
-    def get_config(self):
-        """Build an ``ErnieMConfig`` from the sizes stored on this tester."""
-        return ErnieMConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-        )
-
-    def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
-        """Check the ``last_hidden_state`` shape of ``ErnieMModel`` (called with ``input_ids`` only)."""
-        model = ErnieMModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, return_dict=True)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        """Check the start/end logits shapes of ``ErnieMForQuestionAnswering``."""
-        model = ErnieMForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_information_extraction(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        """Check the start/end logits shapes of ``ErnieMForInformationExtraction``."""
-        model = ErnieMForInformationExtraction(config=config)
-        model.to(torch_device)
-        model.eval()
-        # The incoming sequence labels are replaced by float targets shaped like ``input_ids``.
-        sequence_labels = torch.ones_like(input_ids, dtype=torch.float32)
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        """Check the logits shape of ``ErnieMForSequenceClassification``."""
-        config.num_labels = self.num_labels
-        model = ErnieMForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        """Check the logits shape of ``ErnieMForTokenClassification``."""
-        config.num_labels = self.num_labels
-        model = ErnieMForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        # ``Tensor.to`` returns the moved tensor instead of modifying it in place, so the
-        # result has to be kept; optional inputs that are ``None`` are passed through.
-        input_ids, input_mask, token_labels = (
-            tensor.to(torch_device) if tensor is not None else None
-            for tensor in (input_ids, input_mask, token_labels)
-        )
-
-        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        """Check the logits shape of ``ErnieMForMultipleChoice`` on inputs repeated once per choice."""
-        config.num_choices = self.num_choices
-        model = ErnieMForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        """Return ``(config, inputs_dict)`` holding only ``input_ids`` and ``attention_mask``."""
-        config, input_ids, input_mask, *_ = self.prepare_config_and_inputs()
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class ErnieMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    """Common, pipeline and per-head tests for the ErnieM models, driven by ``ErnieMModelTester``."""
-
-    all_model_classes = (
-        (
-            ErnieMModel,
-            ErnieMForMultipleChoice,
-            ErnieMForQuestionAnswering,
-            ErnieMForSequenceClassification,
-            ErnieMForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": ErnieMModel,
-            "question-answering": ErnieMForQuestionAnswering,
-            "text-classification": ErnieMForSequenceClassification,
-            "token-classification": ErnieMForTokenClassification,
-            "zero-shot": ErnieMForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-    test_torchscript = False
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        """Return ``True`` for the QA pipeline tests, which are skipped for this model."""
-        if pipeline_test_casse_name == "QAPipelineTests":
-            return True
-
-        return False
-
-    def setUp(self):
-        """Create the model tester and the config tester used by the tests."""
-        self.model_tester = ErnieMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        """Run the base-model check once per position embedding type."""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        # Named ``embedding_type`` so the loop variable does not shadow the builtin ``type``.
-        for embedding_type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = embedding_type
-            self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_information_extraction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_information_extraction(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        """Load the pretrained checkpoint and check that a model object is returned."""
-        model_name = "susnato/ernie-m-base_pytorch"
-        model = ErnieMModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class ErnieMModelIntegrationTest(unittest.TestCase):
-    """Slow integration check of the pretrained ErnieM base model against reference activations."""
-
-    @slow
-    def test_inference_model(self):
-        """Compare the output shape and the top-left 3x3 slice of the first output with stored values."""
-        model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch")
-        model.eval()
-        token_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        hidden = model(token_ids)[0]
-
-        # TODO Replace vocab size
-        hidden_size = 768
-        self.assertEqual(hidden.shape, torch.Size((1, 6, hidden_size)))
-
-        reference = torch.tensor(
-            [[[-0.0012, 0.1245, -0.0214], [-0.0742, 0.0244, -0.0771], [-0.0333, 0.1164, -0.1554]]]
-        )
-        slices_match = torch.allclose(hidden[:, :3, :3], reference, atol=1e-3)
-        self.assertTrue(slices_match)
--- a/tests/models/ernie_m/test_tokenization_ernie_m.py
+++ b/tests/models/ernie_m/test_tokenization_ernie_m.py
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the ErnieM tokenizer."""
-
-import unittest
-
-from transformers import ErnieMTokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
-
-
-@require_sentencepiece
-@require_tokenizers
-class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    """Tokenizer tests for ``ErnieMTokenizer`` built on the SentencePiece fixture in ``SAMPLE_VOCAB``."""
-
-    from_pretrained_id = "susnato/ernie-m-base_pytorch"
-    tokenizer_class = ErnieMTokenizer
-    test_seq2seq = False
-    test_sentencepiece = True
-    test_rust_tokenizer = False
-    test_sentencepiece_ignore_case = False
-
-    def setUp(self):
-        """Save a tokenizer built from the fixture into ``self.tmpdirname``."""
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def get_input_output_texts(self, tokenizer):
-        """Return an ``(input_text, output_text)`` pair; both are the same sentence."""
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = "<pad>"
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        """Check the first, second and last vocabulary keys and the vocabulary length."""
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "<pad>")
-        self.assertEqual(vocab_keys[1], "<unk>")
-        self.assertEqual(vocab_keys[-1], "▁eloquent")
-        self.assertEqual(len(vocab_keys), 30_000)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
-
-    def test_rust_and_python_full_tokenizers(self):
-        """Compare slow and fast tokenizer outputs; returns early when ``test_rust_tokenizer`` is off."""
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        """Check tokens, ids and the ids-to-tokens round trip for two sample sentences."""
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        # ErnieMTokenizer(paddlenlp implementation) outputs '9' instead of '_9' so to mimic that '_9' is changed to '9'
-        self.assertListEqual(
-            tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
-        )
-
-    def test_sequence_builders(self):
-        """Check how ``build_inputs_with_special_tokens`` wraps a single sequence and a pair."""
-        tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        cls_id = [tokenizer.cls_token_id]
-        sep_id = [tokenizer.sep_token_id]
-        # unittest assertions are not stripped under ``python -O`` and show both values on failure.
-        self.assertEqual(encoded_sentence, cls_id + text + sep_id)
-        self.assertEqual(encoded_pair, cls_id + text + sep_id + sep_id + text_2 + sep_id)
-
-    @slow
-    def test_tokenizer_integration(self):
-        """Compare the pretrained tokenizer's encoding of three sentences with a stored encoding."""
-        expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="susnato/ernie-m-base_pytorch",
-            sequences=[
-                "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
-                "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
-                "Language Understanding (NLU) and Natural Language Generation (NLG) with over32+ pretrained "
-                "models in100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
-                "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
-                "conditioning on both left and right context in all layers.",
-                "The quick brown fox jumps over the lazy dog.",
-            ],
-        )
--- a/tests/models/gptsan_japanese/__init__.py
+++ b/tests/models/gptsan_japanese/__init__.py
--- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import (
-    GPTSanJapaneseConfig,
-    GPTSanJapaneseForConditionalGeneration,
-    GPTSanJapaneseModel,
-    GPTSanJapaneseTokenizer,
-    is_torch_available,
-)
-from transformers.generation import GenerationConfig
-from transformers.testing_utils import require_torch, slow, tooslow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-class GPTSanJapaneseTester:
-    """Build a small ``GPTSanJapaneseConfig`` with random ``input_ids`` and run a basic forward check.
-
-    ``parent`` is the object whose assertion methods are used; every other
-    constructor argument is stored as an attribute of the same name.
-    ``seq_length`` is set to ``num_contexts``.
-    """
-
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        num_contexts=7,
-        # For common tests
-        is_training=True,
-        hidden_size=32,
-        ext_size=42,
-        num_hidden_layers=2,
-        num_ext_layers=2,
-        num_attention_heads=4,
-        num_experts=2,
-        d_ff=32,
-        d_ext=80,
-        d_spout=33,
-        dropout_rate=0.0,
-        layer_norm_epsilon=1e-6,
-        expert_capacity=100,
-        router_jitter_noise=0.0,
-    ):
-        self.parent = parent
-
-        # Input sizes; the common tests read ``seq_length``, which mirrors ``num_contexts``.
-        self.vocab_size = vocab_size
-        self.batch_size = batch_size
-        self.num_contexts = num_contexts
-        self.seq_length = self.num_contexts
-        self.is_training = is_training
-
-        # Layer counts and widths.
-        self.hidden_size = hidden_size
-        self.ext_size = ext_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_ext_layers = num_ext_layers
-        self.num_attention_heads = num_attention_heads
-        self.d_ff = d_ff
-        self.d_ext = d_ext
-        self.d_spout = d_spout
-
-        # Expert / router settings and regularisation.
-        self.num_experts = num_experts
-        self.expert_capacity = expert_capacity
-        self.router_jitter_noise = router_jitter_noise
-        self.dropout_rate = dropout_rate
-        self.layer_norm_epsilon = layer_norm_epsilon
-
-    def get_large_model_config(self):
-        """Load the config of the ``Tanrei/GPTSAN-japanese`` checkpoint."""
-        return GPTSanJapaneseConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-
-    def prepare_config_and_inputs(self):
-        """Return ``(config, input_ids)`` with random ids of shape ``[batch_size, seq_length]``."""
-        token_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        return self.get_config(), token_ids
-
-    def prepare_config_and_inputs_for_common(self):
-        """Return ``(config, inputs_dict)`` where the dict holds only ``"input_ids"``."""
-        token_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        return self.get_config(), {"input_ids": token_ids}
-
-    def get_config(self):
-        """Build a ``GPTSanJapaneseConfig`` from the sizes stored on this tester."""
-        # Hidden layers that are not ext layers are configured as switch layers.
-        switch_layer_count = self.num_hidden_layers - self.num_ext_layers
-        return GPTSanJapaneseConfig(
-            vocab_size=self.vocab_size,
-            num_contexts=self.seq_length,
-            d_model=self.hidden_size,
-            d_ff=self.d_ff,
-            d_ext=self.d_ext,
-            d_spout=self.d_spout,
-            num_switch_layers=switch_layer_count,
-            num_ext_layers=self.num_ext_layers,
-            num_heads=self.num_attention_heads,
-            num_experts=self.num_experts,
-            expert_capacity=self.expert_capacity,
-            dropout_rate=self.dropout_rate,
-            layer_norm_epsilon=self.layer_norm_epsilon,
-            router_jitter_noise=self.router_jitter_noise,
-        )
-
-    def create_and_check_model(
-        self,
-        config,
-        input_ids,
-    ):
-        """Run ``GPTSanJapaneseForConditionalGeneration`` on ``input_ids`` and check the result is not ``None``."""
-        generator = GPTSanJapaneseForConditionalGeneration(config=config)
-        generator.to(torch_device)
-        generator.eval()
-        result = generator(input_ids=input_ids)
-        self.parent.assertIsNotNone(result)
-
-
-@require_torch
-class GPTSanJapaneseTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (GPTSanJapaneseModel,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "conversational": GPTSanJapaneseForConditionalGeneration,
-            "feature-extraction": GPTSanJapaneseForConditionalGeneration,
-            "summarization": GPTSanJapaneseForConditionalGeneration,
-            "text2text-generation": GPTSanJapaneseForConditionalGeneration,
-            "translation": GPTSanJapaneseForConditionalGeneration,
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-    is_encoder_decoder = False
-    test_pruning = False
-    test_headmasking = False
-    test_save_load_fast_init_to_base = False
-    test_training = False
-    # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
-    model_split_percents = [0.5, 0.8, 0.9]
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "SummarizationPipelineTests":
-            # TODO: fix `_reorder_cache` is not implemented for this model
-            return True
-        elif pipeline_test_casse_name == "Text2TextGenerationPipelineTests":
-            # TODO: check this.
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = GPTSanJapaneseTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
-    def test_config(self):
-        GPTSanJapaneseConfig()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(
-        reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
-    )
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-
-@require_torch
-class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-    all_model_classes = (GPTSanJapaneseForConditionalGeneration,) if is_torch_available() else ()
-    fx_compatible = False
-    is_encoder_decoder = False
-    test_pruning = False
-    test_headmasking = False
-    # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
-    model_split_percents = [0.5, 0.8, 0.9]
-
-    def setUp(self):
-        self.model_tester = GPTSanJapaneseTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
-    def test_config(self):
-        GPTSanJapaneseConfig()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(
-        reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
-    )
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Gptsan does not use inputs_embeds")
-    def test_inputs_embeds_matches_input_ids(self):
-        pass
-
-    @slow
-    def test_logits(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        input_ids = tokenizer.encode("武田信玄は", return_tensors="pt")
-        outputs = model(input_ids)
-        output_logits = outputs.logits.detach().cpu().numpy()
-        # Output of original model created with mesh-tensoflow
-        # fmt: off
-        target = [
-            [-12.037839889526367, -12.433061599731445, -14.333840370178223, -12.450345993041992, -11.1661376953125,
-            -11.930137634277344, -10.659740447998047, -12.909574508666992, -13.241043090820312, -13.398579597473145,
-            -11.107524871826172, -12.3685941696167, -22.97943115234375, -10.481067657470703, -12.484030723571777,
-            -12.807360649108887, -14.769700050354004, -12.233579635620117, -13.428145408630371, -22.624177932739258],
-            [-7.511149883270264, -8.281851768493652, -7.943127155303955, -7.55021333694458, -6.49869966506958,
-            -7.586796283721924, -6.978085994720459, -7.839145183563232, -8.21964168548584, -8.695091247558594,
-            -6.706910610198975, -6.6585798263549805, -19.565698623657227, -5.353842735290527, -8.350686073303223,
-            -8.039388656616211, -10.856569290161133, -7.75154447555542, -8.819022178649902, -19.51532745361328],
-            [-9.73066234588623, -10.223922729492188, -9.932981491088867, -11.857836723327637, -7.662626266479492,
-            -11.13529109954834, -7.765097618103027, -11.472923278808594, -9.543149948120117, -11.905633926391602,
-            -9.366164207458496, -11.5734281539917, -23.699003219604492, -9.429590225219727, -10.42839241027832,
-            -10.585240364074707, -10.94771957397461, -11.095416069030762, -10.390240669250488, -23.769372940063477],
-            [-9.728265762329102, -9.859712600708008, -10.09729290008545, -9.678522109985352, -6.879519939422607,
-            -9.68487548828125, -4.2803425788879395, -10.018914222717285, -9.308445930480957, -10.63394546508789,
-            -8.083646774291992, -9.06301498413086, -21.904266357421875, -8.90160846710205, -8.841876029968262,
-            -11.856719970703125, -12.079398155212402, -11.233753204345703, -10.177338600158691, -21.87256622314453],
-            [-9.669764518737793, -9.614198684692383, -9.814510345458984, -9.996501922607422, -11.375690460205078,
-            -10.113405227661133, -10.546867370605469, -10.04369068145752, -10.907809257507324, -10.504216194152832,
-            -11.129199028015137, -10.151124000549316, -21.96586799621582, -9.086349487304688, -11.730339050292969,
-            -10.460667610168457, -10.298049926757812, -10.784148216247559, -10.840693473815918, -22.03152847290039],
-        ]
-        # fmt: on
-        target = np.array(target).flatten()
-        predict = output_logits[0, :, :20].flatten()
-
-        def check(a, b, epsilon=5e-4):
-            return abs(a - b) < epsilon * max(abs(a), abs(b))
-
-        self.assertTrue(np.all([check(target[i], predict[i]) for i in range(len(target))]))
-
-    @slow
-    def test_batch_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        # use different length sentences to test batching
-        sentences = [
-            "甲斐なら武田と言うほど",
-            "織田信長は、",
-        ]
-
-        tokenizer.padding_side = "left"
-        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
-        input_ids = inputs["input_ids"].to(torch_device)
-
-        self.assertNotEqual(inputs["attention_mask"][0].numpy().tolist(), inputs["attention_mask"][1].numpy().tolist())
-
-        outputs = model.generate(
-            input_ids=input_ids,
-            attention_mask=inputs["attention_mask"].to(torch_device),
-            max_new_tokens=3,
-            generation_config=generation_config,
-        )
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-        output_non_padded = model.generate(
-            input_ids=inputs_non_padded, max_new_tokens=3, generation_config=generation_config
-        )
-
-        inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=3, generation_config=generation_config)
-
-        self.assertNotEqual(inputs_non_padded.shape, inputs_padded.shape)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "甲斐なら武田と言うほど甲斐の武田",
-            "織田信長は、このような",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
-
-    @tooslow
-    def test_sample(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        # Output of original model created with mesh-tensoflow
-        target = [
-            ("武田信玄は", 35675),
-            ("武田信玄は、", 45),
-            ("武田信玄は、この", 29),
-            ("武田信玄は、このよう", 30642),
-            ("武田信玄は、このような", 35680),
-            ("武田信玄は、このような「", 8640),
-            ("武田信玄は、このような「武田", 31617),
-            ("武田信玄は、このような「武田家", 30646),
-            ("武田信玄は、このような「武田家の", 31617),
-            ("武田信玄は、このような「武田家の家", 31381),
-        ]
-        for input, output in target:
-            input_ids = tokenizer.encode(input, return_tensors="pt")
-            outputs = model(input_ids)
-            output_logits = outputs.logits.detach().cpu().numpy()[0]
-            output_id = np.argmax(output_logits[-1])
-            self.assertEqual(output_id, output)
-
-    @slow
-    def test_spout_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        input_text = "武田信玄は、"
-        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(torch_device)
-        input_ids_batch = tokenizer([input_text, input_text], return_tensors="pt").input_ids.to(torch_device)
-
-        # spout from uniform and one-hot
-
-        spouts = [
-            [0.87882208, 0.38426396, 0.33220248, 0.43890406, 0.16562252,
-            0.04803985, 0.211572  , 0.23188473, 0.37153068, 0.7836377 ,
-            0.02160172, 0.38761719, 0.75290772, 0.90198857, 0.34365777,
-            0.64168169, 0.44318471, 0.14575746, 0.92562881, 0.40812148,
-            0.29019122, 0.88861599, 0.65524846, 0.43563456, 0.38177187,
-            0.70832965, 0.81527892, 0.68832812, 0.38833192, 0.4561522 ,
-            0.14828817, 0.47248213, 0.54357335, 0.82009566, 0.1338884 ,
-            0.02755417, 0.19764677, 0.2422084 , 0.04757674, 0.65409606,
-            0.0824589 , 0.03304383, 0.94387689, 0.98764509, 0.82433901,
-            0.27646741, 0.64907493, 0.76009406, 0.30087915, 0.17904689,
-            0.41601714, 0.67046398, 0.10422822, 0.08447374, 0.07354344,
-            0.61423565, 0.70284866, 0.7532333 , 0.1972038 , 0.29575659,
-            0.90583886, 0.29265307, 0.50000175, 0.70407655, 0.889363  ,
-            0.81904418, 0.66829128, 0.64468815, 0.56563723, 0.85601875,
-            0.94924672, 0.00166762, 0.25220643, 0.74540219, 0.67993247,
-            0.1549675 , 0.39385352, 0.92153607, 0.63745931, 0.27759043,
-            0.84702295, 0.65904271, 0.58676614, 0.8666936 , 0.39607438,
-            0.79954983, 0.42220697, 0.39650381, 0.7849864 , 0.56150201,
-            0.15678925, 0.14746032, 0.34542114, 0.47026783, 0.11956489,
-            0.25421435, 0.33788901, 0.68934842, 0.36424685, 0.71737898,
-            0.38983449, 0.94393779, 0.39575588, 0.36616553, 0.87104665,
-            0.64630203, 0.22516905, 0.88270804, 0.15031338, 0.75144345,
-            0.46459025, 0.85396454, 0.86355643, 0.65139851, 0.70266061,
-            0.30241389, 0.81056497, 0.88865969, 0.38773807, 0.70635849,
-            0.90718459, 0.43245789, 0.28000654, 0.45935562, 0.08773519,
-            0.9552151 , 0.93901511, 0.22489288], # uniform
-            [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
-             0., 0., 0., 0., 0., 0., 0., 0.],
-        ]  # fmt: skip
-
-        output1 = model.generate(
-            input_ids=input_ids,
-            spout=spouts[0],
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output2 = model.generate(
-            input_ids=input_ids,
-            spout=spouts[1],
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output3 = model.generate(
-            input_ids=input_ids_batch,
-            spout=spouts,
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        out1_sentence = tokenizer.decode(output1[0])
-        out2_sentence = tokenizer.decode(output2[0])
-        batch_out_sentence = tokenizer.batch_decode(output3)
-
-        expected_output_sentence = [
-            "武田信玄は、武田氏の滅亡後、武田氏の居城であった甲斐武田氏の居城である",
-            "武田信玄は、武田家の滅亡を防ぐため、武田家の家臣である武田信虎を討",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
-
-    @slow
-    def test_prefix_lm_generation(self):
-        model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
-        tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
-        model.to(torch_device)
-
-        # set deterministically
-        generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-        generation_config.top_k = 1
-
-        prefix_text_1 = "武田信玄"
-        prefix_text_2 = "織田信長"
-        input_text_1 = "は、"
-        input_text_2 = "が、"
-        input_tok_1 = tokenizer(input_text_1, prefix_text=prefix_text_1, return_tensors="pt")
-        input_tok_2 = tokenizer(input_text_2, prefix_text=prefix_text_2, return_tensors="pt")
-        input_tok_3 = tokenizer([[prefix_text_1, input_text_1], [prefix_text_2, input_text_2]], return_tensors="pt")
-
-        output1 = model.generate(
-            input_ids=input_tok_1.input_ids.to(torch_device),
-            token_type_ids=input_tok_1.token_type_ids.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output2 = model.generate(
-            input_ids=input_tok_2.input_ids.to(torch_device),
-            token_type_ids=input_tok_2.token_type_ids.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        output3 = model.generate(
-            input_ids=input_tok_3.input_ids.to(torch_device),
-            token_type_ids=input_tok_3.token_type_ids.to(torch_device),
-            attention_mask=input_tok_3.attention_mask.to(torch_device),
-            max_new_tokens=20,
-            generation_config=generation_config,
-        )
-
-        out1_sentence = tokenizer.decode(output1[0])
-        out2_sentence = tokenizer.decode(output2[0])
-        batch_out_sentence = tokenizer.batch_decode(output3)
-
-        expected_output_sentence = [
-            "武田信玄は、武田氏の祖である武田信虎を、その子・武田信友を擁して",
-            "織田信長が、織田信長の妻・お市の方を妻として迎えたという逸話が残",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
--- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
+++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.models.gptsan_japanese.tokenization_gptsan_japanese import (
-    VOCAB_FILES_NAMES,
-    GPTSanJapaneseTokenizer,
-)
-from transformers.testing_utils import require_jinja, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_tokenizers
-class GPTSanJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "Tanrei/GPTSAN-japanese"
-    tokenizer_class = GPTSanJapaneseTokenizer
-    test_rust_tokenizer = False
-    from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = ["こん", "こんに", "にちは", "ばんは", "世界,㔺界", "、", "。", "<BR>", "<SP>", "<TAB>", "<URL>", "<EMAIL>", "<TEL>", "<DATE>", "<PRICE>", "<BLOCK>", "<KIGOU>", "<U2000U2BFF>", "<|emoji1|>", "<unk>", "<|bagoftoken|>", "<|endoftext|>"]  # fmt: skip
-        emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}}  # 😀
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-        with open(self.emoji_file, "w") as emoji_writer:
-            emoji_writer.write(json.dumps(emoji_tokens))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return GPTSanJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_input_output_texts
-    def get_input_output_texts(self, tokenizer):
-        input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"
-        output_text = "こんにちは、世界。 \nこんばんは、世界。😀"
-        return input_text, output_text
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_clean_sequence
-    def get_clean_sequence(self, tokenizer):
-        input_text, output_text = self.get_input_output_texts(tokenizer)
-        ids = tokenizer.encode(output_text, add_special_tokens=False)
-        text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
-        return text, ids
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_pretokenized_inputs
-    def test_pretokenized_inputs(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_pair_input
-    def test_maximum_encoding_length_pair_input(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_single_input
-    def test_maximum_encoding_length_single_input(self):
-        pass  # TODO add if relevant
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_full_tokenizer
-    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-
-        # Testing tokenization
-        input_text = "こんにちは、世界。　こんばんは、㔺界。"
-        expected_token = ["こん", "にちは", "、", "世界", "。", "<SP>", "こん", "ばんは", "、", "㔺界", "。"]
-        tokens = tokenizer.tokenize(input_text)
-        self.assertListEqual(tokens, expected_token)
-
-        # Testing conversion to ids without special tokens
-        expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6]
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(input_ids, expected_ids)
-
-        # Testing conversion to ids with special tokens
-        input_tokens = tokens + [tokenizer.unk_token]
-        expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6, 19]
-        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
-        self.assertListEqual(input_ids, expected_ids)
-
-    def test_token_bagging(self):
-        tokenizer = self.get_tokenizer()
-
-        # Testing tokenization
-        input_text = "こんにちは、<|bagoftoken|>世界。こんばんは、<|bagoftoken|>㔺界。"
-        expected_text = "こんにちは、、、、世界。こんばんは、、、、世界。"
-        tokens = tokenizer.encode(input_text)
-        output_text = tokenizer.decode(tokens)
-        self.assertEqual(output_text, expected_text)
-
-    @slow
-    def test_prefix_input(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        # Testing tokenization
-        prefix_text = "こんにちは、世界。"
-        input_text = "こんばんは、㔺界。😀"
-        expected_text = "こんにちは、世界。こんばんは、世界。😀"
-        tokens_1 = tokenizer.encode(prefix_text + input_text)
-        tokens_2 = tokenizer.encode("", prefix_text=prefix_text + input_text)
-        tokens_3 = tokenizer.encode(input_text, prefix_text=prefix_text)
-        output_text_1 = tokenizer.decode(tokens_1)
-        output_text_2 = tokenizer.decode(tokens_2)
-        output_text_3 = tokenizer.decode(tokens_3)
-        self.assertEqual(output_text_1, expected_text)
-        self.assertEqual(output_text_2, expected_text)
-        self.assertEqual(output_text_3, expected_text)
-
-    @slow
-    def test_token_type_ids(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        # Testing tokenization
-        prefix_text = "こんにちは、世界。"
-        input_text = "こんばんは、㔺界。😀"
-
-        len_prefix = len(tokenizer.encode(prefix_text)) - 2
-        len_text = len(tokenizer.encode(input_text)) - 2
-
-        expected_mask_1 = [1] + [0] * (len_prefix + len_text + 1)
-        expected_mask_2 = [1] * (len_prefix + len_text + 1) + [0]
-        expected_mask_3 = [1] + [1] * (len_prefix) + [0] * (len_text + 1)
-
-        type_id_1 = tokenizer(prefix_text + input_text).token_type_ids
-        type_id_2 = tokenizer("", prefix_text=prefix_text + input_text).token_type_ids
-        type_id_3 = tokenizer(input_text, prefix_text=prefix_text).token_type_ids
-        self.assertListEqual(type_id_1, expected_mask_1)
-        self.assertListEqual(type_id_2, expected_mask_2)
-        self.assertListEqual(type_id_3, expected_mask_3)
-
-    @slow
-    def test_prefix_tokens(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        x_token_1 = tokenizer.encode("あンいワ")
-        x_token_2 = tokenizer.encode("", prefix_text="あンいワ")
-        x_token_3 = tokenizer.encode("いワ", prefix_text="あン")
-
-        self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_2))
-        self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_3))
-        self.assertNotEqual(x_token_1, x_token_2)
-        self.assertNotEqual(x_token_1, x_token_3)
-        self.assertEqual(x_token_1[1], x_token_2[-1])  # SEG token
-        self.assertEqual(x_token_1[1], x_token_3[3])  # SEG token
-
-    @slow
-    def test_batch_encode(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
-        input_pairs = [["武田信玄", "は、"], ["織田信長", "の配下の、"]]
-        x_token = tokenizer(input_pairs, padding=True)
-        x_token_2 = tokenizer.batch_encode_plus(input_pairs, padding=True)
-
-        # fmt: off
-        expected_outputs = [[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]]
-        expected_typeids = [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]]
-        expected_attmask = [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]
-        # fmt: on
-        self.assertListEqual(x_token.input_ids, expected_outputs)
-        self.assertListEqual(x_token.token_type_ids, expected_typeids)
-        self.assertListEqual(x_token.attention_mask, expected_attmask)
-        self.assertListEqual(x_token_2.input_ids, expected_outputs)
-        self.assertListEqual(x_token_2.token_type_ids, expected_typeids)
-        self.assertListEqual(x_token_2.attention_mask, expected_attmask)
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_conversion_reversible
-    def test_conversion_reversible(self):
-        # Intentionally convert some words to accommodate character fluctuations unique to Japanese
-        pass
-
-    # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_padding_different_model_input_name
-    def test_padding_different_model_input_name(self):
-        # tokenizer has no padding token
-        pass
-
-    @require_jinja
-    def test_tokenization_for_chat(self):
-        tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-        # This is in English, but it's just here to make sure the chat control tokens are being added properly
-        test_chats = [
-            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
-            [
-                {"role": "system", "content": "You are a helpful chatbot."},
-                {"role": "user", "content": "Hello!"},
-                {"role": "assistant", "content": "Nice to meet you."},
-            ],
-            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
-        ]
-        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
-        # fmt: off
-        expected_tokens = [
-            [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999],
-            [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999],
-            [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999]
-        ]
-        # fmt: on
-        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
-            self.assertListEqual(tokenized_chat, expected_tokens)
--- a/tests/models/graphormer/__init__.py
+++ b/tests/models/graphormer/__init__.py
--- a/tests/models/graphormer/test_modeling_graphormer.py
+++ b/tests/models/graphormer/test_modeling_graphormer.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Graphormer model."""
-
-import copy
-import inspect
-import os
-import tempfile
-import unittest
-
-from transformers import GraphormerConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    from torch import tensor
-
-    from transformers import GraphormerForGraphClassification, GraphormerModel
-
-
-class GraphormerModelTester:
-    def __init__(
-        self,
-        parent,
-        num_classes=1,
-        num_atoms=32 * 9,
-        num_edges=32 * 3,
-        num_in_degree=32,
-        num_out_degree=32,
-        num_spatial=32,
-        num_edge_dis=16,
-        multi_hop_max_dist=5,  # sometimes is 20
-        spatial_pos_max=32,
-        edge_type="multi_hop",
-        init_fn=None,
-        max_nodes=32,
-        share_input_output_embed=False,
-        num_hidden_layers=2,
-        embedding_dim=32,
-        ffn_embedding_dim=32,
-        num_attention_heads=4,
-        dropout=0.1,
-        attention_dropout=0.1,
-        activation_dropout=0.1,
-        layerdrop=0.0,
-        encoder_normalize_before=False,
-        pre_layernorm=False,
-        apply_graphormer_init=False,
-        activation_fn="gelu",
-        embed_scale=None,
-        freeze_embeddings=False,
-        num_trans_layers_to_freeze=0,
-        traceable=False,
-        q_noise=0.0,
-        qn_block_size=8,
-        kdim=None,
-        vdim=None,
-        bias=True,
-        self_attention=True,
-        batch_size=10,
-        graph_size=20,
-        is_training=True,
-    ):
-        self.parent = parent
-        self.num_classes = num_classes
-        self.num_labels = num_classes
-        self.num_atoms = num_atoms
-        self.num_in_degree = num_in_degree
-        self.num_out_degree = num_out_degree
-        self.num_edges = num_edges
-        self.num_spatial = num_spatial
-        self.num_edge_dis = num_edge_dis
-        self.edge_type = edge_type
-        self.multi_hop_max_dist = multi_hop_max_dist
-        self.spatial_pos_max = spatial_pos_max
-        self.max_nodes = max_nodes
-        self.num_hidden_layers = num_hidden_layers
-        self.embedding_dim = embedding_dim
-        self.hidden_size = embedding_dim
-        self.ffn_embedding_dim = ffn_embedding_dim
-        self.num_attention_heads = num_attention_heads
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.layerdrop = layerdrop
-        self.encoder_normalize_before = encoder_normalize_before
-        self.pre_layernorm = pre_layernorm
-        self.apply_graphormer_init = apply_graphormer_init
-        self.activation_fn = activation_fn
-        self.embed_scale = embed_scale
-        self.freeze_embeddings = freeze_embeddings
-        self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
-        self.share_input_output_embed = share_input_output_embed
-        self.traceable = traceable
-        self.q_noise = q_noise
-        self.qn_block_size = qn_block_size
-        self.init_fn = init_fn
-        self.kdim = kdim
-        self.vdim = vdim
-        self.self_attention = self_attention
-        self.bias = bias
-        self.batch_size = batch_size
-        self.graph_size = graph_size
-        self.is_training = is_training
-
-    def prepare_config_and_inputs(self):
-        attn_bias = ids_tensor(
-            [self.batch_size, self.graph_size + 1, self.graph_size + 1], self.num_atoms
-        )  # Def not sure here
-        attn_edge_type = ids_tensor([self.batch_size, self.graph_size, self.graph_size, 1], self.num_edges)
-        spatial_pos = ids_tensor([self.batch_size, self.graph_size, self.graph_size], self.num_spatial)
-        in_degree = ids_tensor([self.batch_size, self.graph_size], self.num_in_degree)
-        out_degree = ids_tensor([self.batch_size, self.graph_size], self.num_out_degree)
-        input_nodes = ids_tensor([self.batch_size, self.graph_size, 1], self.num_atoms)
-        input_edges = ids_tensor(
-            [self.batch_size, self.graph_size, self.graph_size, self.multi_hop_max_dist, 1], self.num_edges
-        )
-        labels = ids_tensor([self.batch_size], self.num_classes)
-
-        config = self.get_config()
-
-        return config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
-
-    def get_config(self):
-        return GraphormerConfig(
-            num_atoms=self.num_atoms,
-            num_in_degree=self.num_in_degree,
-            num_out_degree=self.num_out_degree,
-            num_edges=self.num_edges,
-            num_spatial=self.num_spatial,
-            num_edge_dis=self.num_edge_dis,
-            edge_type=self.edge_type,
-            multi_hop_max_dist=self.multi_hop_max_dist,
-            spatial_pos_max=self.spatial_pos_max,
-            max_nodes=self.max_nodes,
-            num_hidden_layers=self.num_hidden_layers,
-            embedding_dim=self.embedding_dim,
-            hidden_size=self.embedding_dim,
-            ffn_embedding_dim=self.ffn_embedding_dim,
-            num_attention_heads=self.num_attention_heads,
-            dropout=self.dropout,
-            attention_dropout=self.attention_dropout,
-            activation_dropout=self.activation_dropout,
-            layerdrop=self.layerdrop,
-            encoder_normalize_before=self.encoder_normalize_before,
-            pre_layernorm=self.pre_layernorm,
-            apply_graphormer_init=self.apply_graphormer_init,
-            activation_fn=self.activation_fn,
-            embed_scale=self.embed_scale,
-            freeze_embeddings=self.freeze_embeddings,
-            num_trans_layers_to_freeze=self.num_trans_layers_to_freeze,
-            share_input_output_embed=self.share_input_output_embed,
-            traceable=self.traceable,
-            q_noise=self.q_noise,
-            qn_block_size=self.qn_block_size,
-            init_fn=self.init_fn,
-            kdim=self.kdim,
-            vdim=self.vdim,
-            self_attention=self.self_attention,
-            bias=self.bias,
-        )
-
-    def create_and_check_model(
-        self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
-    ):
-        model = GraphormerModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_nodes=input_nodes,
-            attn_bias=attn_bias,
-            in_degree=in_degree,
-            out_degree=out_degree,
-            spatial_pos=spatial_pos,
-            input_edges=input_edges,
-            attn_edge_type=attn_edge_type,
-            labels=labels,
-        )
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, self.graph_size + 1, self.hidden_size)
-        )
-
-    def create_and_check_for_graph_classification(
-        self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
-    ):
-        model = GraphormerForGraphClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_nodes=input_nodes,
-            attn_bias=attn_bias,
-            in_degree=in_degree,
-            out_degree=out_degree,
-            spatial_pos=spatial_pos,
-            input_edges=input_edges,
-            attn_edge_type=attn_edge_type,
-            labels=labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            attn_bias,
-            attn_edge_type,
-            spatial_pos,
-            in_degree,
-            out_degree,
-            input_nodes,
-            input_edges,
-            labels,
-        ) = config_and_inputs
-        inputs_dict = {
-            "attn_bias": attn_bias,
-            "attn_edge_type": attn_edge_type,
-            "spatial_pos": spatial_pos,
-            "in_degree": in_degree,
-            "out_degree": out_degree,
-            "input_nodes": input_nodes,
-            "input_edges": input_edges,
-            "labels": labels,
-        }
-        return config, inputs_dict
-
-
-@require_torch
-class GraphormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (GraphormerForGraphClassification, GraphormerModel) if is_torch_available() else ()
-    all_generative_model_classes = ()
-    pipeline_model_mapping = {"feature-extraction": GraphormerModel} if is_torch_available() else {}
-    test_pruning = False
-    test_head_masking = False
-    test_resize_embeddings = False
-    main_input_name_nodes = "input_nodes"
-    main_input_name_edges = "input_edges"
-    has_attentions = False  # does not output attention
-
-    def setUp(self):
-        self.model_tester = GraphormerModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GraphormerConfig, has_text_modality=False)
-
-    # overwrite from common as `Graphormer` requires more input arguments
-    def _create_and_check_torchscript(self, config, inputs_dict):
-        if not self.test_torchscript:
-            return
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.torchscript = True
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            try:
-                required_keys = (
-                    "input_nodes",
-                    "input_edges",
-                    "attn_bias",
-                    "in_degree",
-                    "out_degree",
-                    "spatial_pos",
-                    "attn_edge_type",
-                )
-                required_inputs = tuple(inputs[k] for k in required_keys)
-                model(*required_inputs)
-                traced_model = torch.jit.trace(model, required_inputs)
-            except RuntimeError:
-                self.fail("Couldn't trace module.")
-
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
-
-                try:
-                    torch.jit.save(traced_model, pt_file_name)
-                except Exception:
-                    self.fail("Couldn't save module.")
-
-                try:
-                    loaded_model = torch.jit.load(pt_file_name)
-                except Exception:
-                    self.fail("Couldn't load module.")
-
-            model.to(torch_device)
-            model.eval()
-
-            loaded_model.to(torch_device)
-            loaded_model.eval()
-
-            model_state_dict = model.state_dict()
-            loaded_model_state_dict = loaded_model.state_dict()
-
-            non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
-                    non_persistent_buffers[key] = loaded_model_state_dict[key]
-
-            loaded_model_state_dict = {
-                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
-            }
-
-            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
-
-            model_buffers = list(model.buffers())
-            for non_persistent_buffer in non_persistent_buffers.values():
-                found_buffer = False
-                for i, model_buffer in enumerate(model_buffers):
-                    if torch.equal(non_persistent_buffer, model_buffer):
-                        found_buffer = True
-                        break
-
-                self.assertTrue(found_buffer)
-                model_buffers.pop(i)
-
-            model_buffers = list(model.buffers())
-            for non_persistent_buffer in non_persistent_buffers.values():
-                found_buffer = False
-                for i, model_buffer in enumerate(model_buffers):
-                    if torch.equal(non_persistent_buffer, model_buffer):
-                        found_buffer = True
-                        break
-
-                self.assertTrue(found_buffer)
-                model_buffers.pop(i)
-
-            models_equal = True
-            for layer_name, p1 in model_state_dict.items():
-                if layer_name in loaded_model_state_dict:
-                    p2 = loaded_model_state_dict[layer_name]
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
-
-            self.assertTrue(models_equal)
-
-            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-            # (Even with this call, there are still memory leak by ~0.04MB)
-            self.clear_torch_jit_class_registry()
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    @unittest.skip(reason="Graphormer does not use one single inputs_embedding but three")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Graphormer does not implement feed forward chunking")
-    def test_feed_forward_chunking(self):
-        pass
-
-    @unittest.skip(reason="Graphormer does not share input and output embeddings")
-    def test_model_common_attributes(self):
-        pass
-
-    def test_initialization(self):
-        def _config_zero_init(config):
-            configs_no_init = copy.deepcopy(config)
-            for key in configs_no_init.__dict__.keys():
-                if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
-                    setattr(configs_no_init, key, 1e-10)
-            return configs_no_init
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertTrue(
-                        -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-    def test_hidden_states_output(self):
-        def check_hidden_states_output(inputs_dict, config, model_class):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
-            expected_num_layers = getattr(
-                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
-            )
-            self.assertEqual(len(hidden_states), expected_num_layers)
-
-            batch_size = self.model_tester.batch_size
-
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [batch_size, self.model_tester.hidden_size],
-            )
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            # Always returns hidden_states
-            check_hidden_states_output(inputs_dict, config, model_class)
-
-    def test_retain_grad_hidden_states_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.output_hidden_states = True
-        config.output_attentions = False
-
-        # no need to test all models as different heads yield the same functionality
-        model_class = self.all_model_classes[0]
-        model = model_class(config)
-        model.to(torch_device)
-
-        outputs = model(**inputs_dict)
-        output = outputs[0]
-
-        hidden_states = outputs.hidden_states[0]
-        hidden_states.retain_grad()
-
-        output.flatten()[0].backward(retain_graph=True)
-
-        self.assertIsNotNone(hidden_states.grad)
-
-    # Inputs are 'input_nodes' and 'input_edges' not 'input_ids'
-    def test_model_main_input_name(self):
-        for model_class in self.all_model_classes:
-            model_signature = inspect.signature(getattr(model_class, "forward"))
-            # The main input is the name of the argument after `self`
-            observed_main_input_name_nodes = list(model_signature.parameters.keys())[1]
-            observed_main_input_name_edges = list(model_signature.parameters.keys())[2]
-            self.assertEqual(model_class.main_input_name_nodes, observed_main_input_name_nodes)
-            self.assertEqual(model_class.main_input_name_edges, observed_main_input_name_edges)
-
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["input_nodes", "input_edges"]
-            self.assertListEqual(arg_names[:2], expected_arg_names)
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_graph_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_graph_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "clefourrier/graphormer-base-pcqm4mv1"
-        model = GraphormerForGraphClassification.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class GraphormerModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_graph_classification(self):
-        model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2")
-
-        # Actual real graph data from the MUTAG dataset
-        # fmt: off
-        model_input = {
-            "attn_bias": tensor(
-                [
-                    [
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                    ],
-                    [
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
-                    ],
-                ]
-            ),
-            "attn_edge_type": tensor(
-                [
-                    [
-                        [[0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [3], [0], [3], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [3], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0]],
-                        [[0], [0], [0], [3], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]],
-                    ],
-                    [
-                        [[0], [3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]],
-                        [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [3], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0]],
-                        [[3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                        [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
-                    ],
-                ]
-            ),
-            # fmt: on
-            "spatial_pos": tensor(
-                [
-                    [
-                        [1, 2, 3, 4, 3, 2, 4, 5, 6, 5, 6, 7, 8, 7, 9, 10, 10],
-                        [2, 1, 2, 3, 4, 3, 5, 6, 5, 4, 5, 6, 7, 6, 8, 9, 9],
-                        [3, 2, 1, 2, 3, 4, 4, 5, 4, 3, 4, 5, 6, 5, 7, 8, 8],
-                        [4, 3, 2, 1, 2, 3, 3, 4, 3, 2, 3, 4, 5, 4, 6, 7, 7],
-                        [3, 4, 3, 2, 1, 2, 2, 3, 4, 3, 4, 5, 6, 5, 7, 8, 8],
-                        [2, 3, 4, 3, 2, 1, 3, 4, 5, 4, 5, 6, 7, 6, 8, 9, 9],
-                        [4, 5, 4, 3, 2, 3, 1, 2, 3, 4, 5, 6, 5, 4, 6, 7, 7],
-                        [5, 6, 5, 4, 3, 4, 2, 1, 2, 3, 4, 5, 4, 3, 5, 6, 6],
-                        [6, 5, 4, 3, 4, 5, 3, 2, 1, 2, 3, 4, 3, 2, 4, 5, 5],
-                        [5, 4, 3, 2, 3, 4, 4, 3, 2, 1, 2, 3, 4, 3, 5, 6, 6],
-                        [6, 5, 4, 3, 4, 5, 5, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5],
-                        [7, 6, 5, 4, 5, 6, 6, 5, 4, 3, 2, 1, 2, 3, 3, 4, 4],
-                        [8, 7, 6, 5, 6, 7, 5, 4, 3, 4, 3, 2, 1, 2, 2, 3, 3],
-                        [7, 6, 5, 4, 5, 6, 4, 3, 2, 3, 4, 3, 2, 1, 3, 4, 4],
-                        [9, 8, 7, 6, 7, 8, 6, 5, 4, 5, 4, 3, 2, 3, 1, 2, 2],
-                        [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 1, 3],
-                        [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 3, 1],
-                    ],
-                    [
-                        [1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 4, 5, 5, 0, 0, 0, 0],
-                        [2, 1, 2, 3, 4, 5, 4, 3, 4, 3, 5, 6, 6, 0, 0, 0, 0],
-                        [3, 2, 1, 2, 3, 4, 3, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0],
-                        [4, 3, 2, 1, 2, 3, 4, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0],
-                        [5, 4, 3, 2, 1, 2, 3, 4, 5, 6, 6, 7, 7, 0, 0, 0, 0],
-                        [6, 5, 4, 3, 2, 1, 2, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0],
-                        [5, 4, 3, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0],
-                        [4, 3, 2, 3, 4, 3, 2, 1, 2, 3, 3, 4, 4, 0, 0, 0, 0],
-                        [3, 4, 3, 4, 5, 4, 3, 2, 1, 2, 2, 3, 3, 0, 0, 0, 0],
-                        [2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 3, 4, 4, 0, 0, 0, 0],
-                        [4, 5, 4, 5, 6, 5, 4, 3, 2, 3, 1, 2, 2, 0, 0, 0, 0],
-                        [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 1, 3, 0, 0, 0, 0],
-                        [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 3, 1, 0, 0, 0, 0],
-                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    ],
-                ]
-            ),
-            "in_degree": tensor(
-                [
-                    [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2],
-                    [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0],
-                ]
-            ),
-            "out_degree": tensor(
-                [
-                    [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2],
-                    [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0],
-                ]
-            ),
-            "input_nodes": tensor(
-                [
-                    [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
-                    [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [0], [0], [0], [0]],
-                ]
-            ),
-            "input_edges": tensor(
-                [
-                    [
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                    ],
-                    [
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [4]],
-                            [[4], [4], [4], [4], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[4], [4], [4], [0], [0]],
-                            [[4], [0], [0], [0], [0]],
-                            [[4], [4], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                        [
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                            [[0], [0], [0], [0], [0]],
-                        ],
-                    ],
-                ]
-            ),
-            "labels": tensor([1, 0]),
-        }
-
-        output = model(**model_input)["logits"]
-
-        expected_shape = torch.Size((2, 1))
-        self.assertEqual(output.shape, expected_shape)
-
-        expected_logs = torch.tensor(
-            [[7.6060], [7.4126]]
-        )
-
-        self.assertTrue(torch.allclose(output, expected_logs, atol=1e-4))
--- a/tests/models/jukebox/__init__.py
+++ b/tests/models/jukebox/__init__.py
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from unittest import skip
-
-from transformers import is_torch_available
-from transformers.testing_utils import (
-    require_torch,
-    require_torch_accelerator,
-    require_torch_fp16,
-    slow,
-    torch_device,
-)
-from transformers.trainer_utils import set_seed
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import JukeboxModel, JukeboxPrior, JukeboxTokenizer
-
-
-@require_torch
-class Jukebox1bModelTester(unittest.TestCase):
-    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-1b-lyrics"
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone
-    Stand in the desert. . . . Near them, on the sand,
-    Half sunk a shattered visage lies, whose frown,
-    And wrinkled lip, and sneer of cold command,
-    Tell that its sculptor well those passions read
-    Which yet survive, stamped on these lifeless things,
-    The hand that mocked them, and the heart that fed;
-    And on the pedestal, these words appear:
-    My name is Ozymandias, King of Kings;
-    Look on my Works, ye Mighty, and despair!
-    Nothing beside remains. Round the decay
-    Of that colossal Wreck, boundless and bare
-    The lone and level sands stretch far away
-    """,
-    }
-    # fmt: off
-    EXPECTED_OUTPUT_2 = [
-        1864, 1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534,
-        1000, 1445, 1105, 1130, 967, 515, 1434, 1620, 534, 1495, 283, 1445,
-        333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883,
-        1405, 1276, 1455, 1228
-    ]
-
-    EXPECTED_OUTPUT_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653
-    ]
-
-    EXPECTED_OUTPUT_1 = [
-        1125, 1751, 697, 1776, 1141, 1476, 391, 697, 1125, 684, 867, 416,
-        844, 1372, 1274, 717, 1274, 844, 1299, 1419, 697, 1370, 317, 1125,
-        191, 1440, 1370, 1440, 1370, 282, 1621, 1370, 368, 349, 867, 1872,
-        1262, 869, 1728, 747
-    ]
-    EXPECTED_OUTPUT_1_PT_2 = [
-        416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416
-    ]
-
-    EXPECTED_OUTPUT_0 = [
-        1755, 842, 307, 1843, 1022, 1395, 234, 1554, 806, 739, 1022, 442,
-        616, 556, 268, 1499, 933, 457, 1440, 1837, 755, 985, 308, 902,
-        293, 1443, 1671, 1141, 1533, 555, 1562, 1061, 287, 417, 1022, 2008,
-        1186, 1015, 1777, 268
-    ]
-    EXPECTED_OUTPUT_0_PT_2 = [
-        854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842,
-        185, 417, 185, 842, 307, 842, 591, 842, 185, 842, 307, 842,
-        591, 842, 1353, 842, 185, 842, 591, 842, 591, 114, 591, 842,
-        185, 842, 591, 89
-    ]
-
-    EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
-
-    EXPECTED_PRIMED_0 = [
-        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
-        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
-        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
-        1907, 1788, 596, 1626
-    ]
-    EXPECTED_PRIMED_1 = [
-        1236, 1668, 1484, 1920, 1848, 1409, 139, 864, 1828, 1272, 1599, 824,
-        1672, 139, 555, 1484, 824, 1920, 555, 596, 1579, 1599, 1231, 1599,
-        1637, 1407, 212, 824, 1599, 116, 1433, 824, 258, 1599, 1433, 1895,
-        1063, 1433, 1433, 1599
-    ]
-    EXPECTED_PRIMED_2 = [
-        1684, 1873, 1119, 1189, 395, 611, 1901, 972, 890, 1337, 1392, 1927,
-        96, 972, 672, 780, 1119, 890, 158, 771, 1073, 1927, 353, 1331,
-        1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
-        197, 391, 302, 1930
-    ]
-    EXPECTED_VQVAE_ENCODE = [
-        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
-        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
-        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
-        1907, 1788, 596, 1626
-    ]
-    EXPECTED_VQVAE_DECODE = [
-        -0.0492, -0.0524, -0.0565, -0.0640, -0.0686, -0.0684, -0.0677, -0.0664,
-        -0.0605, -0.0490, -0.0330, -0.0168, -0.0083, -0.0075, -0.0051, 0.0025,
-        0.0136, 0.0261, 0.0386, 0.0497, 0.0580, 0.0599, 0.0583, 0.0614,
-        0.0740, 0.0889, 0.1023, 0.1162, 0.1211, 0.1212, 0.1251, 0.1336,
-        0.1502, 0.1686, 0.1883, 0.2148, 0.2363, 0.2458, 0.2507, 0.2531
-    ]
-    EXPECTED_AUDIO_COND = [
-        0.0256, -0.0544, 0.1600, -0.0032, 0.1066, 0.0825, -0.0013, 0.3440,
-        0.0210, 0.0412, -0.1777, -0.0892, -0.0164, 0.0285, -0.0613, -0.0617,
-        -0.0137, -0.0201, -0.0175, 0.0215, -0.0627, 0.0520, -0.0730, 0.0970,
-        -0.0100, 0.0442, -0.0586, 0.0207, -0.0015, -0.0082
-    ]
-    EXPECTED_META_COND = [
-        0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
-        0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
-        -0.0168, -0.0394, -0.1113, 0.0124, 0.0442, 0.0267, -0.0003, -0.1536,
-        -0.0116, -0.1837, -0.0180, -0.1026, -0.0777, -0.0456
-    ]
-    EXPECTED_LYRIC_COND = [
-        76, 27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45, 45, 76, 38, 31, 33,
-        45, 76, 41, 32, 76, 45, 46, 41, 40, 31, 78, 76
-    ]
-    # fmt: on
-
-    def prepare_inputs(self):
-        tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
-        tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens
-
-    @slow
-    def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = self.prepare_inputs()
-
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
-        self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
-        self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
-    @slow
-    def test_conditioning(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.allow_tf32 = False
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-
-        labels = self.prepare_inputs()
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
-
-        top_prior = model.priors[0]
-        start = 0
-        music_token_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
-        metadata = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
-
-        self.assertIsNone(music_token_conds)
-        self.assertListEqual(metadata.numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
-
-        audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata)
-        torch.testing.assert_close(
-            audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol=1e-4, rtol=1e-4
-        )
-        torch.testing.assert_close(
-            metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol=1e-4, rtol=1e-4
-        )
-        torch.testing.assert_close(
-            lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
-        )
-
-    @slow
-    def test_primed_sampling(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.allow_tf32 = False
-
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        set_seed(0)
-        waveform = torch.rand((1, 5120, 1))
-        tokens = list(self.prepare_inputs())
-
-        zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0], None, None]
-        zs = model._sample(
-            zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[0][0][:40], torch.tensor(self.EXPECTED_PRIMED_0))
-
-        upper_2 = torch.cat((zs[0], torch.zeros(1, 2048 - zs[0].shape[-1])), dim=-1).long()
-        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0], None]
-        zs = model._sample(
-            zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[1][0][:40], torch.tensor(self.EXPECTED_PRIMED_1))
-
-        upper_1 = torch.cat((zs[1], torch.zeros(1, 2048 - zs[1].shape[-1])), dim=-1).long()
-        zs = [upper_2, upper_1, model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
-        zs = model._sample(
-            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
-        )
-        torch.testing.assert_close(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
-
-    @slow
-    def test_vqvae(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        set_seed(0)
-        x = torch.rand((1, 5120, 1))
-        with torch.no_grad():
-            zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
-        torch.testing.assert_close(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
-
-        with torch.no_grad():
-            x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
-        torch.testing.assert_close(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol=1e-4)
-
-
-@require_torch
-class Jukebox5bModelTester(unittest.TestCase):
-    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-5b-lyrics"
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone
-    Stand in the desert. . . . Near them, on the sand,
-    Half sunk a shattered visage lies, whose frown,
-    And wrinkled lip, and sneer of cold command,
-    Tell that its sculptor well those passions read
-    Which yet survive, stamped on these lifeless things,
-    The hand that mocked them, and the heart that fed;
-    And on the pedestal, these words appear:
-    My name is Ozymandias, King of Kings;
-    Look on my Works, ye Mighty, and despair!
-    Nothing beside remains. Round the decay
-    Of that colossal Wreck, boundless and bare
-    The lone and level sands stretch far away
-    """,
-    }
-
-    # fmt: off
-    EXPECTED_OUTPUT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
-    ]
-    EXPECTED_OUTPUT_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
-    ]
-
-    EXPECTED_OUTPUT_1 = [
-        1125, 416, 1125, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
-    ]
-    EXPECTED_OUTPUT_1_PT_2 = [
-        416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416
-    ]
-
-    EXPECTED_OUTPUT_0 = [
-        1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
-        616, 616, 616, 616, 307, 290, 417, 1755, 234, 1755, 185, 290,
-        290, 290, 307, 616, 616, 616, 616, 616, 290, 234, 234, 1755,
-        234, 234, 1755, 234, 185, 185, 307, 616, 616, 616, 616, 290,
-        1755, 1755, 1755, 234, 234, 1755, 1572, 290, 307, 616, 34, 616
-    ]
-    EXPECTED_OUTPUT_0_PT_2 = [
-        854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842, 185,
-        417, 185, 842, 307, 842, 591, 842, 185, 842, 185, 842, 591, 842,
-        1353, 842, 185, 842, 591, 842, 591, 114, 591, 842, 185, 842, 591,
-        89, 591, 842, 591, 842, 591, 417, 1372, 842, 1372, 842, 34, 842,
-        185, 89, 591, 842, 185, 842, 591, 632
-    ]
-
-    EXPECTED_GPU_OUTPUTS_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
-    ]
-    EXPECTED_GPU_OUTPUTS_2_PT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        653, 653, 653, 653, 653, 653, 653, 1853, 1177, 1536, 1228,
-        710, 475, 1489, 1229, 1224, 231, 1224, 252, 1434, 653, 475,
-        1106, 1877, 1599, 1228, 1600, 1683, 1182, 1853, 475, 1864,
-        252, 1229, 1434, 2001
-    ]
-
-    EXPECTED_GPU_OUTPUTS_1 = [
-        1125, 1125, 416, 1125, 1125, 416, 1125, 1125, 416, 416, 1125, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
-    ]
-    EXPECTED_GPU_OUTPUTS_0 = [
-        491, 1755, 34, 1613, 1755, 417, 992, 1613, 222, 842, 1353, 1613,
-        844, 632, 185, 1613, 844, 632, 185, 1613, 185, 842, 677, 1613,
-        185, 114, 1353, 1613, 307, 89, 844, 1613, 307, 1332, 234, 1979,
-        307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
-        307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
-    ]
-    # fmt: on
-
-    def prepare_inputs(self, model_id):
-        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-        tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens
-
-    @slow
-    def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = self.prepare_inputs(self.model_id)
-
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
-    @slow
-    @require_torch_accelerator
-    @skip("Not enough GPU memory on CI runners")
-    def test_slow_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = [i.to(torch_device) for i in self.prepare_inputs(self.model_id)]
-
-        set_seed(0)
-        model.priors[0].to(torch_device)
-        zs = [torch.zeros(1, 0, dtype=torch.long).to(torch_device) for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
-        model.priors[0].cpu()
-
-        set_seed(0)
-        model.priors[1].to(torch_device)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
-        model.priors[1].cpu()
-
-        set_seed(0)
-        model.priors[2].to(torch_device)
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        torch.testing.assert_close(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
-    @slow
-    @require_torch_accelerator
-    @require_torch_fp16
-    def test_fp16_slow_sampling(self):
-        prior_id = "ArthurZ/jukebox_prior_0"
-        model = JukeboxPrior.from_pretrained(prior_id, min_duration=0).eval().half().to(torch_device)
-
-        labels = self.prepare_inputs(prior_id)[0].to(torch_device)
-        metadata = model.get_metadata(labels, 0, 7680, 0)
-        set_seed(0)
-        outputs = model.sample(1, metadata=metadata, sample_tokens=60)
-        self.assertIn(outputs[0].cpu().tolist(), [self.EXPECTED_GPU_OUTPUTS_2, self.EXPECTED_GPU_OUTPUTS_2_PT_2])
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import JukeboxTokenizer
-from transformers.testing_utils import require_torch
-
-
-class JukeboxTokenizationTest(unittest.TestCase):
-    tokenizer_class = JukeboxTokenizer
-    metas = {
-        "artist": "Zac Brown Band",
-        "genres": "Country",
-        "lyrics": """I met a traveller from an antique land,
-        Who said "Two vast and trunkless legs of stone
-        Stand in the desert. . . . Near them, on the sand,
-        Half sunk a shattered visage lies, whose frown,
-        And wrinkled lip, and sneer of cold command,
-        Tell that its sculptor well those passions read
-        Which yet survive, stamped on these lifeless things,
-        The hand that mocked them, and the heart that fed;
-        And on the pedestal, these words appear:
-        My name is Ozymandias, King of Kings;
-        Look on my Works, ye Mighty, and despair!
-        Nothing beside remains. Round the decay
-        Of that colossal Wreck, boundless and bare
-        The lone and level sands stretch far away
-        """,
-    }
-
-    @require_torch
-    def test_1b_lyrics_tokenizer(self):
-        """
-        how to run the same test with openAI
-        ...
-        """
-        import torch
-
-        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
-        tokens = tokenizer(**self.metas)["input_ids"]
-        # fmt: off
-        EXPECTED_OUTPUT = [
-            torch.tensor([[
-                0, 0, 0, 7169, 507, 9, 76, 39, 31, 46, 76, 27,
-                76, 46, 44, 27, 48, 31, 38, 38, 31, 44, 76, 32,
-                44, 41, 39, 76, 27, 40, 76, 27, 40, 46, 35, 43,
-                47, 31, 76, 38, 27, 40, 30, 64, 78, 76, 76, 76,
-                76, 76, 76, 76, 76, 23, 34, 41, 76, 45, 27, 35,
-                30, 76, 71, 20, 49, 41, 76, 48, 27, 45, 46, 76,
-                27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45,
-                45, 76, 38, 31, 33, 45, 76, 41, 32, 76, 45, 46,
-                41, 40, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                19, 46, 27, 40, 30, 76, 35, 40, 76, 46, 34, 31,
-                76, 30, 31, 45, 31, 44, 46, 63, 76, 63, 76, 63,
-                76, 63, 76, 14, 31, 27, 44, 76, 46, 34, 31, 39,
-                64, 76, 41, 40, 76, 46, 34, 31, 76, 45, 27, 40,
-                30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76, 8,
-                27, 38, 32, 76, 45, 47, 40, 37, 76, 27, 76, 45,
-                34, 27, 46, 46, 31, 44, 31, 30, 76, 48, 35, 45,
-                27, 33, 31, 76, 38, 35, 31, 45, 64, 76, 49, 34,
-                41, 45, 31, 76, 32, 44, 41, 49, 40, 64, 78, 76,
-                76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76, 49,
-                44, 35, 40, 37, 38, 31, 30, 76, 38, 35, 42, 64,
-                76, 27, 40, 30, 76, 45, 40, 31, 31, 44, 76, 41,
-                32, 76, 29, 41, 38, 30, 76, 29, 41, 39, 39, 27,
-                40, 30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                20, 31, 38, 38, 76, 46, 34, 27, 46, 76, 35, 46,
-                45, 76, 45, 29, 47, 38, 42, 46, 41, 44, 76, 49,
-                31, 38, 38, 76, 46, 34, 41, 45, 31, 76, 42, 27,
-                45, 45, 35, 41, 40, 45, 76, 44, 31, 27, 30, 78,
-                76, 76, 76, 76, 76, 76, 76, 76, 23, 34, 35, 29,
-                34, 76, 51, 31, 46, 76, 45, 47, 44, 48, 35, 48,
-                31, 64, 76, 45, 46, 27, 39, 42, 31, 30, 76, 41,
-                40, 76, 46, 34, 31, 45, 31, 76, 38, 35, 32, 31,
-                38, 31, 45, 45, 76, 46, 34, 35, 40, 33, 45, 64,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 20, 34, 31,
-                76, 34, 27, 40, 30, 76, 46, 34, 27, 46, 76, 39,
-                41, 29, 37, 31, 30, 76, 46, 34, 31, 39, 64, 76,
-                27, 40, 30, 76, 46, 34, 31, 76, 34, 31, 27, 44,
-                46, 76, 46, 34, 27, 46, 76, 32, 31, 30, 66, 78,
-                76, 76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76,
-                41, 40, 76, 46, 34, 31, 76, 42, 31, 30, 31, 45,
-                46, 27, 38, 64, 76, 46, 34, 31, 45, 31, 76, 49,
-                41, 44, 30, 45, 76, 27, 42, 42, 31, 27, 44, 65,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 13, 51, 76,
-                40, 27, 39, 31, 76, 35, 45, 76, 15, 52, 51, 39,
-                27, 40, 30, 35, 27, 45, 64, 76, 11, 35, 40, 33,
-                76, 41, 32, 76, 11, 35, 40, 33, 45, 66, 78, 76,
-                76, 76, 76, 76, 76, 76, 76, 12, 41, 41, 37, 76,
-                41, 40, 76, 39, 51, 76, 23, 41, 44, 37, 45, 64,
-                76, 51, 31, 76, 13, 35, 33, 34, 46, 51, 64, 76,
-                27, 40, 30, 76, 30, 31, 45, 42, 27, 35, 44, 67,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 14, 41, 46,
-                34, 35, 40, 33, 76, 28, 31, 45, 35, 30, 31, 76,
-                44, 31, 39, 27, 35, 40, 45, 63, 76, 18, 41, 47,
-                40, 30, 76, 46, 34, 31, 76, 30, 31, 29, 27, 51,
-                78, 76, 76, 76, 76, 76, 76, 76, 76, 15, 32, 76,
-                46, 34, 27, 46, 76, 29, 41, 38, 41, 45, 45, 27,
-                38, 76, 23, 44, 31, 29, 37, 64, 76, 28, 41, 47,
-                40, 30, 38, 31, 45, 45, 76, 27, 40, 30, 76, 28,
-                27, 44, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
-                20, 34, 31, 76, 38, 41, 40, 31, 76, 27, 40, 30,
-                76, 38, 31, 48, 31, 38, 76, 45, 27, 40, 30, 45,
-                76, 45, 46, 44, 31, 46, 29, 34, 76, 32, 27, 44,
-                76, 27, 49, 27, 51, 78, 76, 76, 76, 76, 76, 76,
-                76, 76]]),
-            torch.tensor([[0, 0, 0, 1069, 11]]),
-            torch.tensor([[0, 0, 0, 1069, 11]]),
-        ]
-        # fmt: on
-        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
-        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
-        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
-
-    @require_torch
-    def test_5b_lyrics_tokenizer(self):
-        """
-        The outputs are similar that open AI but do not have the same format as this one is adapted to the HF integration.
-        """
-        import torch
-
-        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-5b-lyrics")
-        tokens = tokenizer(**self.metas)["input_ids"]
-        # fmt: off
-        EXPECTED_OUTPUT = [
-            torch.tensor([[
-                0, 0, 0, 1069, 11, -1, -1, -1, -1, 9, 77, 39,
-                31, 46, 77, 27, 77, 46, 44, 27, 48, 31, 38, 38,
-                31, 44, 77, 32, 44, 41, 39, 77, 27, 40, 77, 27,
-                40, 46, 35, 43, 47, 31, 77, 38, 27, 40, 30, 64,
-                79, 77, 77, 77, 77, 77, 77, 77, 77, 23, 34, 41,
-                77, 45, 27, 35, 30, 77, 72, 20, 49, 41, 77, 48,
-                27, 45, 46, 77, 27, 40, 30, 77, 46, 44, 47, 40,
-                37, 38, 31, 45, 45, 77, 38, 31, 33, 45, 77, 41,
-                32, 77, 45, 46, 41, 40, 31, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 19, 46, 27, 40, 30, 77, 35, 40,
-                77, 46, 34, 31, 77, 30, 31, 45, 31, 44, 46, 63,
-                77, 63, 77, 63, 77, 63, 77, 14, 31, 27, 44, 77,
-                46, 34, 31, 39, 64, 77, 41, 40, 77, 46, 34, 31,
-                77, 45, 27, 40, 30, 64, 79, 77, 77, 77, 77, 77,
-                77, 77, 77, 8, 27, 38, 32, 77, 45, 47, 40, 37,
-                77, 27, 77, 45, 34, 27, 46, 46, 31, 44, 31, 30,
-                77, 48, 35, 45, 27, 33, 31, 77, 38, 35, 31, 45,
-                64, 77, 49, 34, 41, 45, 31, 77, 32, 44, 41, 49,
-                40, 64, 79, 77, 77, 77, 77, 77, 77, 77, 77, 1,
-                40, 30, 77, 49, 44, 35, 40, 37, 38, 31, 30, 77,
-                38, 35, 42, 64, 77, 27, 40, 30, 77, 45, 40, 31,
-                31, 44, 77, 41, 32, 77, 29, 41, 38, 30, 77, 29,
-                41, 39, 39, 27, 40, 30, 64, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 20, 31, 38, 38, 77, 46, 34, 27,
-                46, 77, 35, 46, 45, 77, 45, 29, 47, 38, 42, 46,
-                41, 44, 77, 49, 31, 38, 38, 77, 46, 34, 41, 45,
-                31, 77, 42, 27, 45, 45, 35, 41, 40, 45, 77, 44,
-                31, 27, 30, 79, 77, 77, 77, 77, 77, 77, 77, 77,
-                23, 34, 35, 29, 34, 77, 51, 31, 46, 77, 45, 47,
-                44, 48, 35, 48, 31, 64, 77, 45, 46, 27, 39, 42,
-                31, 30, 77, 41, 40, 77, 46, 34, 31, 45, 31, 77,
-                38, 35, 32, 31, 38, 31, 45, 45, 77, 46, 34, 35,
-                40, 33, 45, 64, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 20, 34, 31, 77, 34, 27, 40, 30, 77, 46, 34,
-                27, 46, 77, 39, 41, 29, 37, 31, 30, 77, 46, 34,
-                31, 39, 64, 77, 27, 40, 30, 77, 46, 34, 31, 77,
-                34, 31, 27, 44, 46, 77, 46, 34, 27, 46, 77, 32,
-                31, 30, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77,
-                1, 40, 30, 77, 41, 40, 77, 46, 34, 31, 77, 42,
-                31, 30, 31, 45, 46, 27, 38, 64, 77, 46, 34, 31,
-                45, 31, 77, 49, 41, 44, 30, 45, 77, 27, 42, 42,
-                31, 27, 44, 65, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 13, 51, 77, 40, 27, 39, 31, 77, 35, 45, 77,
-                15, 52, 51, 39, 27, 40, 30, 35, 27, 45, 64, 77,
-                11, 35, 40, 33, 77, 41, 32, 77, 11, 35, 40, 33,
-                45, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77, 12,
-                41, 41, 37, 77, 41, 40, 77, 39, 51, 77, 23, 41,
-                44, 37, 45, 64, 77, 51, 31, 77, 13, 35, 33, 34,
-                46, 51, 64, 77, 27, 40, 30, 77, 30, 31, 45, 42,
-                27, 35, 44, 67, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 14, 41, 46, 34, 35, 40, 33, 77, 28, 31, 45,
-                35, 30, 31, 77, 44, 31, 39, 27, 35, 40, 45, 63,
-                77, 18, 41, 47, 40, 30, 77, 46, 34, 31, 77, 30,
-                31, 29, 27, 51, 79, 77, 77, 77, 77, 77, 77, 77,
-                77, 15, 32, 77, 46, 34, 27, 46, 77, 29, 41, 38,
-                41, 45, 45, 27, 38, 77, 23, 44, 31, 29, 37, 64,
-                77, 28, 41, 47, 40, 30, 38, 31, 45, 45, 77, 27,
-                40, 30, 77, 28, 27, 44, 31, 79, 77, 77, 77, 77,
-                77, 77, 77, 77, 20, 34, 31, 77, 38, 41, 40, 31,
-                77, 27, 40, 30, 77, 38, 31, 48, 31, 38, 77, 45,
-                27, 40, 30, 45, 77, 45, 46, 44, 31, 46, 29, 34,
-                77, 32, 27, 44, 77, 27, 49, 27, 51, 79, 77, 77,
-                77, 77, 77, 77, 77, 77]]),
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
-        ]
-        # fmt: on
-        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
-        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
-        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
--- a/tests/models/mega/__init__.py
+++ b/tests/models/mega/__init__.py
--- a/tests/models/mega/test_modeling_mega.py
+++ b/tests/models/mega/test_modeling_mega.py
-# coding=utf-8
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import MegaConfig, is_torch_available
-from transformers.testing_utils import (
-    TestCasePlus,
-    is_flaky,
-    require_torch,
-    require_torch_fp16,
-    slow,
-    torch_device,
-)
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        MegaForCausalLM,
-        MegaForMaskedLM,
-        MegaForMultipleChoice,
-        MegaForQuestionAnswering,
-        MegaForSequenceClassification,
-        MegaForTokenClassification,
-        MegaModel,
-    )
-
-
-class MegaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_positions=1024,
-        bidirectional=False,  # needed for decoding, and can't modify common generation tests; test separately by overriding
-        ema_projection_size=16,
-        shared_representation_size=64,
-        use_chunking=False,
-        chunk_size=32,
-        attention_activation="softmax",
-        use_normalized_ffn=True,
-        nffn_hidden_size=24,
-        add_token_type_embeddings=True,
-        type_vocab_size=2,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.add_token_type_embeddings = add_token_type_embeddings
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_positions = max_positions
-        self.bidirectional = bidirectional
-        self.ema_projection_size = ema_projection_size
-        self.shared_representation_size = shared_representation_size
-        self.use_chunking = use_chunking
-        self.chunk_size = chunk_size
-        self.attention_activation = attention_activation
-        self.use_normalized_ffn = use_normalized_ffn
-        self.nffn_hidden_size = nffn_hidden_size
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-        self.num_attention_heads = 1
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.add_token_type_embeddings:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        return MegaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            intermediate_size=self.intermediate_size,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            # added args
-            add_token_type_embeddings=self.add_token_type_embeddings,
-            max_positions=self.max_positions,
-            bidirectional=self.bidirectional,
-            ema_projection_size=self.ema_projection_size,
-            shared_representation_size=self.shared_representation_size,
-            use_chunking=self.use_chunking,
-            chunk_size=self.chunk_size,
-            attention_activation=self.attention_activation,
-            use_normalized_ffn=self.use_normalized_ffn,
-            nffn_hidden_size=self.nffn_hidden_size,
-        )
-
-    def get_pipeline_config(self):
-        config = self.get_config()
-        config.vocab_size = 300
-        return config
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        config.bidirectional = False
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_causal_lm(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        model = MegaForCausalLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.is_decoder = True
-        config.bidirectional = False
-        config.add_cross_attention = True
-        model = MegaForCausalLM(config=config).to(torch_device).eval()
-
-        # make sure that ids don't start with pad token
-        mask = input_ids.ne(config.pad_token_id).long()
-        input_ids = input_ids * mask
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next tokens and extend next_input_ids with them
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # make sure that ids don't start with pad token
-        mask = next_tokens.ne(config.pad_token_id).long()
-        next_tokens = next_tokens * mask
-        next_mask = ids_tensor((self.batch_size, 1), vocab_size=2)
-
-        # append the new tokens to input_ids and the new mask to the attention mask
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -1:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def create_and_check_decoder_model_with_chunking(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.use_chunking = True
-        config.output_attentions = True
-        config.attention_activation = "laplace"
-        config.chunk_size = input_ids.size(1) * 2
-
-        model = MegaForCausalLM(config).to(torch_device).eval()
-
-        input_ids = input_ids.repeat(1, 8)
-        # multiply the sequence length by 8 since we repeat the same ids 8 times in input_ids
-        input_mask = random_attention_mask([self.batch_size, self.seq_length * 8])
-
-        result = model(input_ids, attention_mask=input_mask)
-
-        # test that the last dimension of the attentions matches the provided chunk_size
-        self.parent.assertEqual(result["attentions"][0].shape[-1], config.chunk_size)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = MegaForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = MegaForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = MegaForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    # extra checks for Mega-specific model functionality
-    def create_and_check_bidirectionality(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.bidirectional = True
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-        # no mask
-        result = model(input_ids)
-        # with mask & token types
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_chunking_shorter_sequence(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.use_chunking = True
-        config.chunk_size = input_ids.size(1) + 25
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_chunking_longer_sequence(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.use_chunking = True
-
-        # we want the chunk size to be < sequence length, and the sequence length to be a multiple of chunk size
-        config.chunk_size = input_ids.size(1) * 2
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(
-            input_ids.repeat(1, 8),
-        )
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length * 8, self.hidden_size))
-
-    def check_laplace_self_attention(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.attention_activation = "laplace"
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_relu2_self_attention(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.attention_activation = "relu2"
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def check_sequence_length_beyond_max_positions(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.max_positions = self.seq_length - 2
-        model = MegaModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class MegaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            MegaForCausalLM,
-            MegaForMaskedLM,
-            MegaModel,
-            MegaForSequenceClassification,
-            MegaForTokenClassification,
-            MegaForMultipleChoice,
-            MegaForQuestionAnswering,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = (MegaForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": MegaModel,
-            "fill-mask": MegaForMaskedLM,
-            "question-answering": MegaForQuestionAnswering,
-            "text-classification": MegaForSequenceClassification,
-            "text-generation": MegaForCausalLM,
-            "token-classification": MegaForTokenClassification,
-            "zero-shot": MegaForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-
-    fx_compatible = False
-    test_head_masking = False
-    test_pruning = False
-
-    def setUp(self):
-        self.model_tester = MegaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=MegaConfig, hidden_size=37)
-
-    # TODO: @ydshieh
-    @is_flaky(description="Sometimes gives `AssertionError` on expected outputs")
-    def test_pipeline_fill_mask(self):
-        super().test_pipeline_fill_mask()
-
-    # TODO: @ydshieh
-    @is_flaky(
-        description="Sometimes gives `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`"
-    )
-    def test_pipeline_text_generation(self):
-        super().test_pipeline_text_generation()
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def test_for_causal_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_with_chunking(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_with_chunking(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_bidirectionality(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bidirectionality(*config_and_inputs)
-
-    def test_for_chunking_shorter_sequence(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_chunking_shorter_sequence(*config_and_inputs)
-
-    def test_for_chunking_longer_sequence(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_chunking_longer_sequence(*config_and_inputs)
-
-    def test_for_laplace_attention(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_laplace_self_attention(*config_and_inputs)
-
-    def test_for_relu2_attention(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_relu2_self_attention(*config_and_inputs)
-
-    def test_for_sequence_length_beyond_max_positions(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.check_sequence_length_beyond_max_positions(*config_and_inputs)
-
-    @require_torch_fp16
-    def test_generate_fp16(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs_for_decoder()
-        # attention_mask = torch.LongTensor(input_ids.ne(1)).to(torch_device)
-        model = MegaForCausalLM(config).eval().to(torch_device)
-        model.half()
-        model.generate(input_ids, attention_mask=attention_mask)
-        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-    def test_sequence_classification_model(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
-        config.num_labels = self.model_tester.num_labels
-        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
-        model = MegaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    def test_sequence_classification_model_for_multi_label(self):
-        config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
-        config.num_labels = self.model_tester.num_labels
-        config.problem_type = "multi_label_classification"
-        sequence_labels = ids_tensor(
-            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
-        ).to(torch.float)
-        model = MegaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
-        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "mnaylor/mega-base-wikitext"
-        model = MegaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_cpu_offload(self):
-        super().test_cpu_offload()
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_disk_offload(self):
-        super().test_disk_offload()
-
-    @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-    @unittest.skip(
-        reason=(
-            "Calling `self.attention_function` in `MegaMovingAverageGatedAttention.forward` changes the submodules on "
-            "device 1 to device 0 (also changes `requires_grad`). No idea how this could happen for now."
-        )
-    )
-    def test_multi_gpu_data_parallel_forward(self):
-        super().test_multi_gpu_data_parallel_forward()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_simple(self):
-        super().test_torchscript_simple()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_output_hidden_state(self):
-        super().test_torchscript_output_hidden_state()
-
-    @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
-    def test_torchscript_output_attentions(self):
-        super().test_torchscript_output_attentions()
-
-
-@require_torch
-class MegaModelIntegrationTest(TestCasePlus):
-    @slow
-    def test_inference_masked_lm(self):
-        model = MegaForMaskedLM.from_pretrained("mnaylor/mega-base-wikitext")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        with torch.no_grad():
-            output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = torch.tensor(
-            [[[67.8389, 10.1470, -32.7148], [-11.1655, 29.1152, 23.1304], [-3.8015, 66.0397, 29.6733]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_no_head(self):
-        model = MegaModel.from_pretrained("mnaylor/mega-base-wikitext")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        with torch.no_grad():
-            output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 128))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice. taken from output[:, :3, :3]
-        expected_slice = torch.tensor(
-            [[[1.1767, -0.6349, 2.8494], [-0.5109, -0.7745, 1.9495], [-0.3287, -0.2111, 3.3367]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
--- a/tests/models/nat/__init__.py
+++ b/tests/models/nat/__init__.py
--- a/tests/models/nat/test_modeling_nat.py
+++ b/tests/models/nat/test_modeling_nat.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Nat model."""
-
-import collections
-import unittest
-
-from transformers import NatConfig
-from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_backbone_common import BackboneTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers import NatBackbone, NatForImageClassification, NatModel
-
-if is_vision_available():
-    from PIL import Image
-
-    from transformers import AutoImageProcessor
-
-
-class NatModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        image_size=64,
-        patch_size=4,
-        num_channels=3,
-        embed_dim=16,
-        depths=[1, 2, 1],
-        num_heads=[2, 4, 8],
-        kernel_size=3,
-        mlp_ratio=2.0,
-        qkv_bias=True,
-        hidden_dropout_prob=0.0,
-        attention_probs_dropout_prob=0.0,
-        drop_path_rate=0.1,
-        hidden_act="gelu",
-        patch_norm=True,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        is_training=True,
-        scope=None,
-        use_labels=True,
-        num_labels=10,
-        out_features=["stage1", "stage2"],
-        out_indices=[1, 2],
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.num_channels = num_channels
-        self.embed_dim = embed_dim
-        self.depths = depths
-        self.num_heads = num_heads
-        self.kernel_size = kernel_size
-        self.mlp_ratio = mlp_ratio
-        self.qkv_bias = qkv_bias
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.drop_path_rate = drop_path_rate
-        self.hidden_act = hidden_act
-        self.patch_norm = patch_norm
-        self.layer_norm_eps = layer_norm_eps
-        self.initializer_range = initializer_range
-        self.is_training = is_training
-        self.scope = scope
-        self.use_labels = use_labels
-        self.num_labels = num_labels
-        self.out_features = out_features
-        self.out_indices = out_indices
-
-    def prepare_config_and_inputs(self):
-        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
-        labels = None
-        if self.use_labels:
-            labels = ids_tensor([self.batch_size], self.num_labels)
-
-        config = self.get_config()
-
-        return config, pixel_values, labels
-
-    def get_config(self):
-        return NatConfig(
-            num_labels=self.num_labels,
-            image_size=self.image_size,
-            patch_size=self.patch_size,
-            num_channels=self.num_channels,
-            embed_dim=self.embed_dim,
-            depths=self.depths,
-            num_heads=self.num_heads,
-            kernel_size=self.kernel_size,
-            mlp_ratio=self.mlp_ratio,
-            qkv_bias=self.qkv_bias,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            drop_path_rate=self.drop_path_rate,
-            hidden_act=self.hidden_act,
-            patch_norm=self.patch_norm,
-            layer_norm_eps=self.layer_norm_eps,
-            initializer_range=self.initializer_range,
-            out_features=self.out_features,
-            out_indices=self.out_indices,
-        )
-
-    def create_and_check_model(self, config, pixel_values, labels):
-        model = NatModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
-        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
-
-        self.parent.assertEqual(
-            result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
-        )
-
-    def create_and_check_for_image_classification(self, config, pixel_values, labels):
-        model = NatForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values, labels=labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-        # test greyscale images
-        config.num_channels = 1
-        model = NatForImageClassification(config)
-        model.to(torch_device)
-        model.eval()
-
-        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
-        result = model(pixel_values)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_backbone(self, config, pixel_values, labels):
-        model = NatBackbone(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        # verify hidden states
-        self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
-        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16])
-
-        # verify channels
-        self.parent.assertEqual(len(model.channels), len(config.out_features))
-
-        # verify backbone works with out_features=None
-        config.out_features = None
-        model = NatBackbone(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(pixel_values)
-
-        # verify feature maps
-        self.parent.assertEqual(len(result.feature_maps), 1)
-        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4])
-
-        # verify channels
-        self.parent.assertEqual(len(model.channels), 1)
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, labels = config_and_inputs
-        inputs_dict = {"pixel_values": pixel_values}
-        return config, inputs_dict
-
-
-@require_natten
-@require_torch
-class NatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            NatModel,
-            NatForImageClassification,
-            NatBackbone,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {"image-feature-extraction": NatModel, "image-classification": NatForImageClassification}
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = False
-
-    test_torchscript = False
-    test_pruning = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    def setUp(self):
-        self.model_tester = NatModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37)
-
-    def test_config(self):
-        self.create_and_test_config_common_properties()
-        self.config_tester.create_and_test_config_to_json_string()
-        self.config_tester.create_and_test_config_to_json_file()
-        self.config_tester.create_and_test_config_from_and_save_pretrained()
-        self.config_tester.create_and_test_config_with_num_labels()
-        self.config_tester.check_config_can_be_init_without_params()
-        self.config_tester.check_config_arguments_init()
-
-    def create_and_test_config_common_properties(self):
-        return
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
-    def test_backbone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_backbone(*config_and_inputs)
-
-    @unittest.skip(reason="Nat does not use inputs_embeds")
-    def test_inputs_embeds(self):
-        pass
-
-    @unittest.skip(reason="Nat does not use feedforward chunking")
-    def test_feed_forward_chunking(self):
-        pass
-
-    def test_model_common_attributes(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, nn.Linear))
-
-    def test_attention_outputs(self):
-        self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
-
-    def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
-        model = model_class(config)
-        model.to(torch_device)
-        model.eval()
-
-        with torch.no_grad():
-            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
-        hidden_states = outputs.hidden_states
-
-        expected_num_layers = getattr(
-            self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
-        )
-        self.assertEqual(len(hidden_states), expected_num_layers)
-
-        # Nat has a different seq_length
-        patch_size = (
-            config.patch_size
-            if isinstance(config.patch_size, collections.abc.Iterable)
-            else (config.patch_size, config.patch_size)
-        )
-
-        height = image_size[0] // patch_size[0]
-        width = image_size[1] // patch_size[1]
-
-        self.assertListEqual(
-            list(hidden_states[0].shape[-3:]),
-            [height, width, self.model_tester.embed_dim],
-        )
-
-        if model_class.__name__ != "NatBackbone":
-            reshaped_hidden_states = outputs.reshaped_hidden_states
-            self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
-
-            batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
-            reshaped_hidden_states = (
-                reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
-            )
-            self.assertListEqual(
-                list(reshaped_hidden_states.shape[-3:]),
-                [height, width, self.model_tester.embed_dim],
-            )
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        image_size = (
-            self.model_tester.image_size
-            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
-            else (self.model_tester.image_size, self.model_tester.image_size)
-        )
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_hidden_states"] = True
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-            # check that output_hidden_states also work using config
-            del inputs_dict["output_hidden_states"]
-            config.output_hidden_states = True
-
-            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "shi-labs/nat-mini-in1k-224"
-        model = NatModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if "embeddings" not in name and param.requires_grad:
-                    self.assertIn(
-                        ((param.data.mean() * 1e9).round() / 1e9).item(),
-                        [0.0, 1.0],
-                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
-                    )
-
-
-@require_natten
-@require_vision
-@require_torch
-class NatModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_image_processor(self):
-        return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
-
-    @slow
-    def test_inference_image_classification_head(self):
-        model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
-        image_processor = self.default_image_processor
-
-        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
-        # forward pass
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # verify the logits
-        expected_shape = torch.Size((1, 1000))
-        self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device)
-        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-
-@require_torch
-@require_natten
-class NatBackboneTest(unittest.TestCase, BackboneTesterMixin):
-    all_model_classes = (NatBackbone,) if is_torch_available() else ()
-    config_class = NatConfig
-
-    def setUp(self):
-        self.model_tester = NatModelTester(self)
--- a/tests/models/nezha/__init__.py
+++ b/tests/models/nezha/__init__.py
--- a/tests/models/nezha/test_modeling_nezha.py
+++ b/tests/models/nezha/test_modeling_nezha.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import tempfile
-import unittest
-
-from transformers import NezhaConfig, is_torch_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        MODEL_FOR_PRETRAINING_MAPPING,
-        NezhaForMaskedLM,
-        NezhaForMultipleChoice,
-        NezhaForNextSentencePrediction,
-        NezhaForPreTraining,
-        NezhaForQuestionAnswering,
-        NezhaForSequenceClassification,
-        NezhaForTokenClassification,
-        NezhaModel,
-    )
-
-
-class NezhaModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=128,
-        max_relative_position=32,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        """
-        Returns a tiny configuration by default.
-        """
-        return NezhaConfig(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaModel(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = NezhaModel(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_next_sequence_prediction(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForNextSentencePrediction(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=sequence_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_pretraining(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForPreTraining(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            labels=token_labels,
-            next_sentence_label=sequence_labels,
-        )
-        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = NezhaForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = NezhaForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = NezhaForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = NezhaForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class NezhaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            NezhaModel,
-            NezhaForMaskedLM,
-            NezhaForMultipleChoice,
-            NezhaForNextSentencePrediction,
-            NezhaForPreTraining,
-            NezhaForQuestionAnswering,
-            NezhaForSequenceClassification,
-            NezhaForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": NezhaModel,
-            "fill-mask": NezhaForMaskedLM,
-            "question-answering": NezhaForQuestionAnswering,
-            "text-classification": NezhaForSequenceClassification,
-            "token-classification": NezhaForTokenClassification,
-            "zero-shot": NezhaForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-    fx_compatible = True
-
-    # special case for ForPreTraining model
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-        if return_labels:
-            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
-                inputs_dict["labels"] = torch.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
-                )
-                inputs_dict["next_sentence_label"] = torch.zeros(
-                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
-                )
-        return inputs_dict
-
-    def setUp(self):
-        self.model_tester = NezhaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=NezhaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "sijunhe/nezha-cn-base"
-        model = NezhaModel.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-    @slow
-    @require_torch_gpu
-    def test_torchscript_device_change(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            # NezhaForMultipleChoice behaves incorrectly in JIT environments.
-            if model_class == NezhaForMultipleChoice:
-                return
-
-            config.torchscript = True
-            model = model_class(config=config)
-
-            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-            traced_model = torch.jit.trace(
-                model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu"))
-            )
-
-            with tempfile.TemporaryDirectory() as tmp:
-                torch.jit.save(traced_model, os.path.join(tmp, "bert.pt"))
-                loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device)
-                loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))
-
-
-@require_torch
-class NezhaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_nezha_model(self):
-        model = NezhaModel.from_pretrained("sijunhe/nezha-cn-base")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1]])
-        with torch.no_grad():
-            output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = torch.Size((1, 6, 768))
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.tensor([[[0.0685, 0.2441, 0.1102], [0.0600, 0.1906, 0.1349], [0.0221, 0.0819, 0.0586]]])
-
-        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
-
-    @slow
-    def test_inference_nezha_masked_lm(self):
-        model = NezhaForMaskedLM.from_pretrained("sijunhe/nezha-cn-base")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1]])
-        with torch.no_grad():
-            output = model(input_ids, attention_mask=attention_mask)[0]
-        expected_shape = torch.Size((1, 6, 21128))
-        self.assertEqual(output.shape, expected_shape)
-        expected_slice = torch.tensor(
-            [[-2.7939, -1.7902, -2.2189], [-2.8585, -1.8908, -2.3723], [-2.6499, -1.7750, -2.2558]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
--- a/tests/models/qdqbert/__init__.py
+++ b/tests/models/qdqbert/__init__.py