Remove add-new-model in favor of add-new-model-like (#30424)

* Remove add-new-model in favor of add-new-model-like
* nits

Remove add-new-model in favor of add-new-model-like (#30424)
* Remove add-new-model in favor of add-new-model-like
* nits
d4e92f1a · Lysandre Debut · GitHub · 0eb8fbcd · 0eb8fbcd · 0eb8fbcd
Unverified commit d4e92f1a, authored Apr 24, 2024 by Lysandre Debut; committed by GitHub on Apr 24, 2024
16 changed files
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-
-import unittest
-
-from transformers import is_tf_available, {{cookiecutter.camelcase_modelname}}Config
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
-        TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
-        TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-        TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-        TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-        TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
-        TF{{cookiecutter.camelcase_modelname}}Model,
-    )
-
-
-class TF{{cookiecutter.camelcase_modelname}}ModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=5,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.is_training = True
-        self.use_input_mask = True
-        self.use_token_type_ids = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.hidden_size = 32
-        self.num_hidden_layers = 5
-        self.num_attention_heads = 4
-        self.intermediate_size = 37
-        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = 512
-        self.type_vocab_size = 16
-        self.type_sequence_label_size = 2
-        self.initializer_range = 0.02
-        self.num_labels = 3
-        self.num_choices = 4
-        self.scope = None
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = {{cookiecutter.camelcase_modelname}}Config(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            initializer_range=self.initializer_range,
-            return_dict=True,
-        )
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_base_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
-        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs)
-
-        result = model(input_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        # Also check the case where encoder outputs are not passed
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_causal_lm_model(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.is_decoder = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        prediction_scores = model(inputs)["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-    def create_and_check_causal_lm_model_as_decoder(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-            "encoder_hidden_states": encoder_hidden_states,
-            "encoder_attention_mask": encoder_attention_mask,
-        }
-        result = model(inputs)
-
-        inputs = [input_ids, input_mask]
-        result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
-
-        prediction_scores = result["logits"]
-        self.parent.assertListEqual(
-            list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
-
-
-    def create_and_check_causal_lm_model_past(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append to next input_ids and attn_mask
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-
-        output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_with_attn_mask(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
-        # create attention mask
-        half_seq_length = self.seq_length // 2
-        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
-        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
-        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        past_key_values = outputs.past_key_values
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
-        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
-        condition = tf.transpose(
-            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
-        )
-        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        attn_mask = tf.concat(
-            [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)],
-            axis=1,
-        )
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=attn_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True
-        ).hidden_states[0]
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
-
-    def create_and_check_causal_lm_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        config.is_decoder = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-
-        model = TF{{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-
-        input_ids = input_ids[:1, :]
-        input_mask = input_mask[:1, :]
-        encoder_hidden_states = encoder_hidden_states[:1, :, :]
-        encoder_attention_mask = encoder_attention_mask[:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        ).hidden_states[0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        ).hidden_states[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-    def create_and_check_for_masked_lm(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_sequence_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
-        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-        inputs = {
-            "input_ids": multiple_choice_inputs_ids,
-            "attention_mask": multiple_choice_input_mask,
-            "token_type_ids": multiple_choice_token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def create_and_check_for_token_classification(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-        result = model(inputs)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_question_answering(
-        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
-        inputs = {
-            "input_ids": input_ids,
-            "attention_mask": input_mask,
-            "token_type_ids": token_type_ids,
-        }
-
-        result = model(inputs)
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            TF{{cookiecutter.camelcase_modelname}}Model,
-            TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
-            TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
-            TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
-            TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-        )
-        if is_tf_available()
-        else ()
-    )
-
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        """Test the base model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    @unittest.skip(reason="Template classes interact badly with this test.")
-    def test_keras_fit(self):
-        pass
-
-    def test_causal_lm_base_model(self):
-        """Test the base model of the causal LM model
-
-        is_deocder=True, no cross_attention, no encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        """Test the base model as a decoder (of an encoder-decoder architecture)
-
-        is_deocder=True + cross_attention + pass encoder outputs
-        """
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_causal_lm(self):
-        """Test the causal LM model"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model(*config_and_inputs)
-
-    def test_causal_lm_model_as_decoder(self):
-        """Test the causal LM model as a decoder"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_causal_lm_model_as_decoder(*config_and_inputs)
-
-    def test_causal_lm_model_past(self):
-        """Test causal LM model with `past_key_values`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_attn_mask(self):
-        """Test the causal LM model with `past_key_values` and `attention_mask`"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_with_attn_mask(*config_and_inputs)
-
-    def test_causal_lm_model_past_with_large_inputs(self):
-        """Test the causal LM model with `past_key_values` and a longer decoder sequence length"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_causal_lm_model_past_large_inputs(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        """Similar to `test_causal_lm_model_past_with_large_inputs` but with cross-attention"""
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
-        self.assertIsNotNone(model)
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
-        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        # TODO Replace vocab size
-        vocab_size = 32000
-
-        expected_shape = [1, 6, vocab_size]
-        self.assertEqual(output.shape, expected_shape)
-
-        print(output[:, :3, :3])
-
-        # TODO Replace values below with what was printed above.
-        expected_slice = tf.constant(
-            [
-                [
-                    [-0.05243197, -0.04498899, 0.05512108],
-                    [-0.07444685, -0.01064632, 0.04352357],
-                    [-0.05020351, 0.05530146, 0.00700043],
-                ]
-            ]
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
-
-{% else %}
-import unittest
-
-from transformers import (
-    is_tf_available,
-    {{cookiecutter.camelcase_modelname}}Config,
-    {{cookiecutter.camelcase_modelname}}Tokenizer,
-)
-from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
-        TF{{cookiecutter.camelcase_modelname}}Model,
-    )
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTester:
-    config_cls = {{cookiecutter.camelcase_modelname}}Config
-    config_updates = {}
-    hidden_act = "gelu"
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=5,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs_for_common(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
-        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
-        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.config_cls(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_ids=[2],
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.pad_token_id,
-            **self.config_updates,
-        )
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder()
-        input_ids = inputs_dict["input_ids"]
-
-        input_ids = input_ids[:1, :]
-        attention_mask = inputs_dict["attention_mask"][:1, :]
-        self.batch_size = 1
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create hypothetical next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append to next input_ids and
-        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
-        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
-
-        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
-
-        # select random slice
-        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
-        output_from_past_slice = output_from_past[:, :, random_slice_idx]
-
-        # test that outputs are equal for slice
-        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
-
-
-def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int32)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = tf.concat([tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int32), tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int32)], axis=-1)
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": decoder_attention_mask,
-    }
-
-
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
-    all_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model) if is_tf_available() else ()
-    all_generative_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_tf_available() else ()
-    is_encoder_decoder = True
-    test_pruning = False
-    test_head_masking = False
-    test_onnx = False
-
-    def setUp(self):
-        self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    @unittest.skip(reason="Template classes interact badly with this test.")
-    def test_keras_fit(self):
-        pass
-
-
-def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
-    """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
-    if a is None and b is None:
-        return True
-    try:
-        if tf.debugging.assert_near(a, b, atol=atol):
-            return True
-        raise
-    except Exception:
-        if len(prefix) > 0:
-            prefix = f"{prefix}: "
-        raise AssertionError(f"{prefix}{a} != {b}")
-
-
-def _long_tensor(tok_lst):
-    return tf.constant(tok_lst, dtype=tf.int32)
-
-
-# Absolute tolerance (`atol`) used when comparing model output slices against reference values.
-TOLERANCE = 1e-4
-
-
-@slow
-@require_sentencepiece
-@require_tokenizers
-@require_tf
-class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
-    def test_inference_no_head(self):
-        model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-        # change to intended input here
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
-        output = model(**inputs_dict)[0]
-        expected_shape = (1, 11, 1024)
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = tf.Tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
-    def test_inference_with_head(self):
-        model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-        # change to intended input here
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
-        output = model(**inputs_dict)[0]
-        expected_shape = (1, 11, 1024)
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = tf.Tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
-        )
-        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)
-
-    def test_seq_to_seq_generation(self):
-        hf = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-        tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
-        batch_input = [
-            # string 1,
-            # string 2,
-            # string 3,
-            # string 4,
-        ]
-
-        # The below article tests that we don't add any hypotheses outside of the top n_beams
-        dct = tok.batch_encode_plus(
-            batch_input,
-            max_length=512,
-            padding="max_length",
-            truncation_strategy="only_first",
-            truncation=True,
-            return_tensors="tf",
-        )
-
-        hypotheses_batch = hf.generate(
-            input_ids=dct["input_ids"],
-            attention_mask=dct["attention_mask"],
-            num_beams=2,
-        )
-
-        EXPECTED = [
-            # here expected 1,
-            # here expected 2,
-            # here expected 3,
-            # here expected 4,
-        ]
-
-        generated = tok.batch_decode(
-            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
-        )
-        assert generated == EXPECTED
-{%- endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Testing suite for the PyTorch {{cookiecutter.modelname}} model. """
-
-
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-import unittest
-
-from ...test_modeling_common import floats_tensor
-from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from transformers import {{cookiecutter.camelcase_modelname}}Config
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        {{cookiecutter.camelcase_modelname}}ForCausalLM,
-        {{cookiecutter.camelcase_modelname}}ForMaskedLM,
-        {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-        {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-        {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-        {{cookiecutter.camelcase_modelname}}ForTokenClassification,
-        {{cookiecutter.camelcase_modelname}}Model,
-    )
-    from transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
-        {{cookiecutter.uppercase_modelname}}    )
-
-
-class {{cookiecutter.camelcase_modelname}}ModelTester:
-    def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_input_mask = use_input_mask
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.num_choices = num_choices
-        self.scope = scope
-
-    def prepare_config_and_inputs(self):
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-        sequence_labels = None
-        token_labels = None
-        choice_labels = None
-        if self.use_labels:
-            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-            choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-        config = self.get_config()
-
-        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-    def get_config(self):
-        return {{cookiecutter.camelcase_modelname}}Config(
-            vocab_size=self.vocab_size,
-            hidden_size=self.hidden_size,
-            num_hidden_layers=self.num_hidden_layers,
-            num_attention_heads=self.num_attention_heads,
-            intermediate_size=self.intermediate_size,
-            hidden_act=self.hidden_act,
-            hidden_dropout_prob=self.hidden_dropout_prob,
-            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            type_vocab_size=self.type_vocab_size,
-            is_decoder=False,
-            initializer_range=self.initializer_range,
-        )
-
-    def prepare_config_and_inputs_for_decoder(self):
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = self.prepare_config_and_inputs()
-
-        config.is_decoder = True
-        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-        return (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    def create_and_check_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = {{cookiecutter.camelcase_modelname}}Model(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        result = model(input_ids, token_type_ids=token_type_ids)
-        result = model(input_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_model_as_decoder(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-    ):
-        config.add_cross_attention = True
-        model = {{cookiecutter.camelcase_modelname}}Model(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            encoder_hidden_states=encoder_hidden_states,
-        )
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
-    def create_and_check_for_causal_lm(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-    ):
-        model = {{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = {{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-    def create_and_check_decoder_model_past_large_inputs(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-        encoder_hidden_states,
-        encoder_attention_mask,
-    ):
-        config.is_decoder = True
-        config.add_cross_attention = True
-        model = {{cookiecutter.camelcase_modelname}}ForCausalLM(config=config)
-        model.to(torch_device)
-        model.eval()
-
-        # first forward pass
-        outputs = model(
-            input_ids,
-            attention_mask=input_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=True,
-        )
-        past_key_values = outputs.past_key_values
-
-        # create hypothetical multiple next token and extent to next_input_ids
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
-        # append to next input_ids and
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
-        output_from_no_past = model(
-            next_input_ids,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-        output_from_past = model(
-            next_tokens,
-            attention_mask=next_attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def create_and_check_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        model = {{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            input_ids,
-            attention_mask=input_mask,
-            token_type_ids=token_type_ids,
-            start_positions=sequence_labels,
-            end_positions=sequence_labels,
-        )
-        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
-        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
-    def create_and_check_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = {{cookiecutter.camelcase_modelname}}ForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def create_and_check_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_labels = self.num_labels
-        model = {{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-    def create_and_check_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-    ):
-        config.num_choices = self.num_choices
-        model = {{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
-        model.to(torch_device)
-        model.eval()
-        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        result = model(
-            multiple_choice_inputs_ids,
-            attention_mask=multiple_choice_input_mask,
-            token_type_ids=multiple_choice_token_type_ids,
-            labels=choice_labels,
-        )
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            {{cookiecutter.camelcase_modelname}}Model,
-            {{cookiecutter.camelcase_modelname}}ForMaskedLM,
-            {{cookiecutter.camelcase_modelname}}ForCausalLM,
-            {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-            {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            {{cookiecutter.camelcase_modelname}}ForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForCausalLM,) if is_torch_available() else ()
-
-    def setUp(self):
-        self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_model_various_embeddings(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        for type in ["absolute", "relative_key", "relative_key_query"]:
-            config_and_inputs[0].position_embedding_type = type
-            self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
-    def test_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
-    def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
-        (
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
-        input_mask = None
-
-        self.model_tester.create_and_check_model_as_decoder(
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-    @slow
-    def test_model_from_pretrained(self):
-        model_name = "{{coockiecutter.checkpoint_identifier}}"
-        model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained(model_name)
-        self.assertIsNotNone(model)
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = {{cookiecutter.camelcase_modelname}}ForMaskedLM.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
-        input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
-        output = model(input_ids)[0]
-
-        # TODO Replace vocab size
-        vocab_size = 32000
-
-        expected_shape = torch.Size((1, 6, vocab_size))
-        self.assertEqual(output.shape, expected_shape)
-
-        # TODO Replace values below with what was printed above.
-        expected_slice = torch.tensor(
-            [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]]
-        )
-
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
-
-
-{% else -%}
-import copy
-import tempfile
-import unittest
-
-from transformers import is_torch_available
-from transformers.utils import cached_property
-from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        {{cookiecutter.camelcase_modelname}}Config,
-        {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
-        {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-        {{cookiecutter.camelcase_modelname}}ForCausalLM,
-        {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-        {{cookiecutter.camelcase_modelname}}Model,
-        {{cookiecutter.camelcase_modelname}}Tokenizer,
-    )
-    from transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
-        {{cookiecutter.camelcase_modelname}}Decoder,
-        {{cookiecutter.camelcase_modelname}}Encoder,
-    )
-
-
-def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
-    config,
-    input_ids,
-    decoder_input_ids,
-    attention_mask=None,
-    decoder_attention_mask=None,
-):
-    if attention_mask is None:
-        attention_mask = input_ids.ne(config.pad_token_id)
-    if decoder_attention_mask is None:
-        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
-    return {
-        "input_ids": input_ids,
-        "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
-    }
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTester:
-    """Builds a small config with random inputs and hosts the checks shared by the encoder-decoder model tests.
-
-    Args:
-        parent: the test case creating this tester; its `assertTrue`/`assertEqual` methods report failures.
-        The remaining arguments are stored as attributes and used to size the config and the inputs.
-    """
-
-    def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_labels=False,
-        vocab_size=99,
-        hidden_size=16,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        intermediate_size=4,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.eos_token_id = eos_token_id
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-
-    def prepare_config_and_inputs(self):
-        """Return `(config, inputs_dict)` built from random encoder and decoder input ids."""
-        # Clamp to a minimum of 3 so the random ids stay above the default bos/pad/eos ids (0/1/2).
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
-            3,
-        )
-        input_ids[:, -1] = self.eos_token_id  # Eos Token
-
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = {{cookiecutter.camelcase_modelname}}Config(
-            vocab_size=self.vocab_size,
-            d_model=self.hidden_size,
-            encoder_layers=self.num_hidden_layers,
-            decoder_layers=self.num_hidden_layers,
-            encoder_attention_heads=self.num_attention_heads,
-            decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            dropout=self.hidden_dropout_prob,
-            attention_dropout=self.attention_probs_dropout_prob,
-            max_position_embeddings=self.max_position_embeddings,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            pad_token_id=self.pad_token_id,
-        )
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
-        return config, inputs_dict
-
-    def prepare_config_and_inputs_for_common(self):
-        """Return the same `(config, inputs_dict)` pair as `prepare_config_and_inputs`."""
-        config, inputs_dict = self.prepare_config_and_inputs()
-        return config, inputs_dict
-
-    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
-        """Check that decoding three extra tokens from cached `past_key_values` matches a full pass without cache."""
-        model = {{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder().to(torch_device).eval()
-        input_ids = inputs_dict["input_ids"]
-        attention_mask = inputs_dict["attention_mask"]
-
-        # first forward pass
-        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
-
-        output, past_key_values = outputs.to_tuple()
-
-        # create several hypothetical next tokens and the matching attention mask entries
-        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
-
-        # append them to input_ids and attention_mask
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
-
-        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertEqual(output_from_past_slice.shape[1], next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
-
-    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
-        """Check that the encoder and decoder, each saved and reloaded on its own, reproduce the full model's hidden states."""
-        model = {{cookiecutter.camelcase_modelname}}Model(config=config).to(torch_device).eval()
-        outputs = model(**inputs_dict)
-
-        encoder_last_hidden_state = outputs.encoder_last_hidden_state
-        last_hidden_state = outputs.last_hidden_state
-
-        # round-trip the encoder through save_pretrained / from_pretrained
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            encoder = model.get_encoder()
-            encoder.save_pretrained(tmpdirname)
-            encoder = {{cookiecutter.camelcase_modelname}}Encoder.from_pretrained(tmpdirname).to(torch_device)
-
-        encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
-            0
-        ]
-
-        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
-
-        # round-trip the decoder the same way
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            decoder = model.get_decoder()
-            decoder.save_pretrained(tmpdirname)
-            decoder = {{cookiecutter.camelcase_modelname}}Decoder.from_pretrained(tmpdirname).to(torch_device)
-
-        last_hidden_state_2 = decoder(
-            input_ids=inputs_dict["decoder_input_ids"],
-            attention_mask=inputs_dict["decoder_attention_mask"],
-            encoder_hidden_states=encoder_last_hidden_state,
-            encoder_attention_mask=inputs_dict["attention_mask"],
-        )[0]
-
-        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-    # Model test case for the encoder-decoder variant; the flags below switch off pruning, head-masking and
-    # missing-keys checks.
-    all_model_classes = (
-        ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForSequenceClassification, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering)
-        if is_torch_available()
-        else ()
-    )
-    all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_torch_available() else ()
-    is_encoder_decoder = True
-    test_pruning = False
-    test_head_masking = False
-    test_missing_keys = False
-
-    def setUp(self):
-        self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_save_load_strict(self):
-        # Every model class must reload from its own checkpoint without reporting missing keys.
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                # only the loading info is inspected; the reloaded model itself is not needed
-                _, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
-            self.assertEqual(info["missing_keys"], [])
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_encoder_decoder_model_standalone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
-
-    # {{cookiecutter.camelcase_modelname}}ForSequenceClassification does not support inputs_embeds
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering):
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-
-            # swap the token ids for their embeddings before calling the model
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            wte = model.get_input_embeddings()
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = wte(input_ids)
-            else:
-                inputs["inputs_embeds"] = wte(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
-
-            with torch.no_grad():
-                model(**inputs)[0]
-
-    def test_generate_fp16(self):
-        # Smoke test: no assertions, it only checks that generation runs (in half precision when on CUDA).
-        config, input_dict = self.model_tester.prepare_config_and_inputs()
-        input_ids = input_dict["input_ids"]
-        # mask out padding using the configured pad id rather than a hard-coded value
-        attention_mask = input_ids.ne(config.pad_token_id).to(torch_device)
-        model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration(config).eval().to(torch_device)
-        if torch_device == "cuda":
-            model.half()
-        model.generate(input_ids, attention_mask=attention_mask)
-        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-
-def assert_tensors_close(a, b, atol=1e-12, prefix=""):
-    """Return True when `a` and `b` are both None, or are tensors of the same shape whose values are close.
-
-    Raises an `AssertionError` (its message prefixed with `prefix` when given) if either argument is not a
-    tensor, if the shapes differ, or if the values are not close within `atol`.
-    """
-    if a is None and b is None:
-        return True
-    if not (torch.is_tensor(a) and torch.is_tensor(b)):
-        msg = f"{a} != {b}"
-    elif a.shape != b.shape:
-        msg = f"shape mismatch: {tuple(a.shape)} != {tuple(b.shape)}"
-    else:
-        try:
-            if torch.allclose(a, b, atol=atol):
-                return True
-        except RuntimeError:
-            # allclose refuses some inputs (e.g. mismatched dtypes); report them as a difference below.
-            pass
-        pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
-        if a.numel() > 100:
-            msg = f"tensor values are {pct_different:.1%} percent different."
-        else:
-            msg = f"{a} != {b}"
-    if prefix:
-        msg = prefix + ": " + msg
-    raise AssertionError(msg)
-
-
-def _long_tensor(tok_lst):
-    """Build a `torch.long` tensor from `tok_lst`, placed on `torch_device`."""
-    return torch.tensor(tok_lst, device=torch_device, dtype=torch.long)
-
-
-TOLERANCE = 1e-4
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-@slow
-class {{cookiecutter.camelcase_modelname}}ModelIntegrationTests(unittest.TestCase):
-    # Integration tests that load a pretrained checkpoint. The inputs and expected values below are
-    # placeholders to be replaced with real ones.
-
-    @cached_property
-    def default_tokenizer(self):
-        return {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
-
-    def test_inference_no_head(self):
-        model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]])
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
-        with torch.no_grad():
-            output = model(**inputs_dict)[0]
-        expected_shape = torch.Size((1, 11, 1024))
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = torch.tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
-        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
-
-    def test_inference_head(self):
-        model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
-
-        # change to intended input
-        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
-        with torch.no_grad():
-            output = model(**inputs_dict)[0]
-        expected_shape = torch.Size((1, 11, model.config.vocab_size))
-        self.assertEqual(output.shape, expected_shape)
-        # change to expected output here
-        expected_slice = torch.tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
-        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
-
-    def test_seq_to_seq_generation(self):
-        hf = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
-        # reuse the cached tokenizer instead of loading it from the checkpoint a second time
-        tok = self.default_tokenizer
-
-        batch_input = [
-            # string 1,
-            # string 2,
-            # string 3,
-            # string 4,
-        ]
-
-        # Tokenize the whole batch, padded and truncated to 512 tokens.
-        # The deprecated `truncation_strategy` kwarg is dropped: `truncation=True` already selects the strategy.
-        dct = tok.batch_encode_plus(
-            batch_input,
-            max_length=512,
-            padding="max_length",
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        hypotheses_batch = hf.generate(
-            input_ids=dct["input_ids"].to(torch_device),
-            attention_mask=dct["attention_mask"].to(torch_device),
-            num_beams=2,
-        )
-
-        EXPECTED = [
-            # here expected 1,
-            # here expected 2,
-            # here expected 3,
-            # here expected 4,
-        ]
-
-        generated = tok.batch_decode(
-            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
-        )
-        # unittest assertion: not stripped under `python -O` and prints a diff of the two lists on failure
-        self.assertListEqual(generated, EXPECTED)
-
-
-class {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTester:
-    """Builds a small decoder-only config with random inputs and hosts the cache-consistency checks.
-
-    Args:
-        parent: the test case creating this tester; its `assertTrue` method reports failures.
-        The remaining arguments are stored as attributes and used to size the config and the inputs
-        (`is_decoder` is accepted but not used by this class).
-    """
-
-    def __init__(
-        self,
-        parent,
-        vocab_size=99,
-        batch_size=13,
-        d_model=16,
-        decoder_seq_length=7,
-        is_training=True,
-        is_decoder=True,
-        use_attention_mask=True,
-        use_cache=False,
-        use_labels=True,
-        decoder_start_token_id=2,
-        decoder_ffn_dim=32,
-        decoder_layers=4,
-        encoder_attention_heads=4,
-        decoder_attention_heads=4,
-        max_position_embeddings=30,
-        is_encoder_decoder=False,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        scope=None,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.decoder_seq_length = decoder_seq_length
-        # For common tests
-        self.seq_length = self.decoder_seq_length
-        self.is_training = is_training
-        self.use_attention_mask = use_attention_mask
-        self.use_labels = use_labels
-
-        self.vocab_size = vocab_size
-        self.d_model = d_model
-        self.hidden_size = d_model
-        self.num_hidden_layers = decoder_layers
-        self.decoder_layers = decoder_layers
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.encoder_attention_heads = encoder_attention_heads
-        self.decoder_attention_heads = decoder_attention_heads
-        self.num_attention_heads = decoder_attention_heads
-        self.eos_token_id = eos_token_id
-        self.bos_token_id = bos_token_id
-        self.pad_token_id = pad_token_id
-        self.decoder_start_token_id = decoder_start_token_id
-        self.use_cache = use_cache
-        self.max_position_embeddings = max_position_embeddings
-        self.is_encoder_decoder = is_encoder_decoder
-
-        # keep the caller's value instead of discarding it
-        self.scope = scope
-        self.decoder_key_length = decoder_seq_length
-        self.base_model_out_len = 2
-        self.decoder_attention_idx = 1
-
-    def prepare_config_and_inputs(self):
-        """Return `(config, input_ids, attention_mask, lm_labels)`; the mask and labels are None when disabled."""
-        input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        attention_mask = None
-        if self.use_attention_mask:
-            attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-        lm_labels = None
-        if self.use_labels:
-            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-        config = {{cookiecutter.camelcase_modelname}}Config(
-            vocab_size=self.vocab_size,
-            d_model=self.d_model,
-            decoder_layers=self.decoder_layers,
-            decoder_ffn_dim=self.decoder_ffn_dim,
-            encoder_attention_heads=self.encoder_attention_heads,
-            decoder_attention_heads=self.decoder_attention_heads,
-            eos_token_id=self.eos_token_id,
-            bos_token_id=self.bos_token_id,
-            use_cache=self.use_cache,
-            pad_token_id=self.pad_token_id,
-            decoder_start_token_id=self.decoder_start_token_id,
-            max_position_embeddings=self.max_position_embeddings,
-            is_encoder_decoder=self.is_encoder_decoder,
-        )
-
-        return (
-            config,
-            input_ids,
-            attention_mask,
-            lm_labels,
-        )
-
-    def create_and_check_decoder_model_past(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        lm_labels,
-    ):
-        """Check that decoding one extra token from cached `past_key_values` matches a full pass without cache."""
-        config.use_cache = True
-        model = {{cookiecutter.camelcase_modelname}}Decoder(config=config).to(torch_device).eval()
-        # first forward pass
-        outputs = model(input_ids, use_cache=True)
-        outputs_use_cache_conf = model(input_ids)
-        outputs_no_past = model(input_ids, use_cache=False)
-
-        # the cache adds exactly one entry to the output, whether requested by argument or by config
-        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-        past_key_values = outputs["past_key_values"]
-
-        # create a hypothetical next token
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # append it to input_ids
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
-        output_from_no_past = model(next_input_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-    def create_and_check_decoder_model_attention_mask_past(
-        self,
-        config,
-        input_ids,
-        attention_mask,
-        lm_labels,
-    ):
-        """Check cached decoding against a full pass when the second half of the input is masked out."""
-        model = {{cookiecutter.camelcase_modelname}}Decoder(config=config).to(torch_device).eval()
-
-        # create attention mask
-        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-
-        half_seq_length = input_ids.shape[-1] // 2
-        attn_mask[:, half_seq_length:] = 0
-
-        # first forward pass
-        past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
-
-        # create a hypothetical next token
-        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-        # change a random masked slice from input_ids
-        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
-        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
-        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
-
-        # append to next input_ids and attn_mask
-        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-        attn_mask = torch.cat(
-            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
-            dim=1,
-        )
-
-        # get two different outputs; both passes must receive the extended mask, otherwise the token
-        # changed above (which sits in the masked region) would be attended to
-        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
-            "last_hidden_state"
-        ]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-        # test that outputs are equal for slice
-        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
-
-    def prepare_config_and_inputs_for_common(self):
-        """Return `(config, inputs_dict)` holding only `input_ids` and `attention_mask`."""
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            attention_mask,
-            lm_labels,
-        ) = config_and_inputs
-
-        inputs_dict = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        return config, inputs_dict
-
-
-@require_torch
-class {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-    # Test case for the decoder used on its own, without an encoder.
-    is_encoder_decoder = False
-    test_pruning = False
-    all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForCausalLM,) if is_torch_available() else ()
-    all_model_classes = ({{cookiecutter.camelcase_modelname}}Decoder, {{cookiecutter.camelcase_modelname}}ForCausalLM) if is_torch_available() else ()
-
-    def setUp(self):
-        tester = {{cookiecutter.camelcase_modelname}}StandaloneDecoderModelTester(self, is_training=False)
-        self.model_tester = tester
-        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_decoder_model_past(self):
-        tester = self.model_tester
-        tester.create_and_check_decoder_model_past(*tester.prepare_config_and_inputs())
-
-    def test_decoder_model_attn_mask_past(self):
-        tester = self.model_tester
-        tester.create_and_check_decoder_model_attention_mask_past(*tester.prepare_config_and_inputs())
-
-    def test_retain_grad_hidden_states_attentions(self):
-        # Intentionally empty: decoder cannot keep gradients
-        pass
-{% endif -%}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
-## Copyright 2022 The HuggingFace Team. All rights reserved.
-##
-## Licensed under the Apache License, Version 2.0 (the "License");
-## you may not use this file except in compliance with the License.
-## You may obtain a copy of the License at
-##
-##     http://www.apache.org/licenses/LICENSE-2.0
-##
-## Unless required by applicable law or agreed to in writing, software
-## distributed under the License is distributed on an "AS IS" BASIS,
-## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-## See the License for the specific language governing permissions and
-## limitations under the License.
-
-## This file is made so that specific statements may be copied inside existing files. This is useful to copy
-## import statements in __init__.py, or to complete model lists in the AUTO files.
-##
-## It is to be used as such:
-## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH
-## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurrence** of that line in the file at FILE_PATH
-## Put '# Replace with:' followed by the lines that make up the content to insert
-## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting
-## content in that file.
-##
-## Put '## COMMENT' to comment on the file.
-
-# To replace in: "src/transformers/__init__.py"
-# Below: "    # PyTorch models structure" if generating PyTorch
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "{{cookiecutter.camelcase_modelname}}ForMaskedLM",
-            "{{cookiecutter.camelcase_modelname}}ForCausalLM",
-            "{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
-            "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
-            "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
-            "{{cookiecutter.camelcase_modelname}}ForTokenClassification",
-            "{{cookiecutter.camelcase_modelname}}Layer",
-            "{{cookiecutter.camelcase_modelname}}Model",
-            "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-            "load_tf_weights_in_{{cookiecutter.lowercase_modelname}}",
-        ]
-    )
-{% else %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "{{cookiecutter.camelcase_modelname}}ForCausalLM",
-            "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
-            "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
-            "{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
-            "{{cookiecutter.camelcase_modelname}}Model",
-            "{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-        ]
-    )
-{% endif -%}
-# End.
-
-# Below: "    # TensorFlow models structure" if generating TensorFlow
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM",
-            "TF{{cookiecutter.camelcase_modelname}}ForCausalLM",
-            "TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
-            "TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
-            "TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
-            "TF{{cookiecutter.camelcase_modelname}}ForTokenClassification",
-            "TF{{cookiecutter.camelcase_modelname}}Layer",
-            "TF{{cookiecutter.camelcase_modelname}}Model",
-            "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-        ]
-    )
-{% else %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
-            "TF{{cookiecutter.camelcase_modelname}}Model",
-            "TF{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-        ]
-    )
-{% endif -%}
-# End.
-
-# Below: "    # Flax models structure" if generating Flax
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM",
-            "Flax{{cookiecutter.camelcase_modelname}}ForCausalLM",
-            "Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice",
-            "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
-            "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
-            "Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification",
-            "Flax{{cookiecutter.camelcase_modelname}}Layer",
-            "Flax{{cookiecutter.camelcase_modelname}}Model",
-            "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-        ]
-    )
-{% else %}
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].extend(
-        [
-            "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration",
-            "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering",
-            "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification",
-            "Flax{{cookiecutter.camelcase_modelname}}Model",
-            "Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel",
-        ]
-    )
-{% endif -%}
-# End.
-
-# Below: "    # Fast tokenizers structure"
-# Replace with:
-    _import_structure["models.{{cookiecutter.lowercase_modelname}}"].append("{{cookiecutter.camelcase_modelname}}TokenizerFast")
-# End.
-
-# Below: "    # Models"
-# Replace with:
-    "models.{{cookiecutter.lowercase_modelname}}": ["{{cookiecutter.camelcase_modelname}}Config", "{{cookiecutter.camelcase_modelname}}Tokenizer"],
-# End.
-
-# To replace in: "src/transformers/__init__.py"
-# Below: "        # PyTorch model imports" if generating PyTorch
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            {{cookiecutter.camelcase_modelname}}ForMaskedLM,
-            {{cookiecutter.camelcase_modelname}}ForCausalLM,
-            {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-            {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            {{cookiecutter.camelcase_modelname}}ForTokenClassification,
-            {{cookiecutter.camelcase_modelname}}Layer,
-            {{cookiecutter.camelcase_modelname}}Model,
-            {{cookiecutter.camelcase_modelname}}PreTrainedModel,
-            load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
-        )
-{% else %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
-            {{cookiecutter.camelcase_modelname}}ForCausalLM,
-            {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            {{cookiecutter.camelcase_modelname}}Model,
-            {{cookiecutter.camelcase_modelname}}PreTrainedModel,
-        )
-{% endif -%}
-# End.
-
-# Below: "        # TensorFlow model imports" if generating TensorFlow
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
-            TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
-            TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-            TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
-            TF{{cookiecutter.camelcase_modelname}}Layer,
-            TF{{cookiecutter.camelcase_modelname}}Model,
-            TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
-        )
-{% else %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
-            TF{{cookiecutter.camelcase_modelname}}Model,
-            TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
-        )
-{% endif -%}
-# End.
-
-# Below: "        # Flax model imports" if generating Flax
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM,
-            Flax{{cookiecutter.camelcase_modelname}}ForCausalLM,
-            Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
-            Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification,
-            Flax{{cookiecutter.camelcase_modelname}}Layer,
-            Flax{{cookiecutter.camelcase_modelname}}Model,
-            Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
-        )
-{% else %}
-        from .models.{{cookiecutter.lowercase_modelname}} import (
-            Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
-            Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
-            Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
-            Flax{{cookiecutter.camelcase_modelname}}Model,
-            Flax{{cookiecutter.camelcase_modelname}}PreTrainedModel,
-        )
-{% endif -%}
-# End.
-
-# Below: "        # Fast tokenizers imports"
-# Replace with:
-        from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast
-# End.
-
-# Below: "    from .models.albert import AlbertConfig"
-# Replace with:
-    from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Tokenizer
-# End.
-
-
-
-# To replace in: "src/transformers/models/__init__.py"
-# Below: "from . import ("
-# Replace with:
-    {{cookiecutter.lowercase_modelname}},
-# End.
-
-
-# To replace in: "src/transformers/models/auto/configuration_auto.py"
-# Below: "# Add configs here"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}Config"),
-# End.
-
-# Below: "# Add full (and cased) model names here"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}"),
-# End.
-
-
-
-# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch
-# Below: "# Base model mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model with LM heads mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow
-# Below: "# Base model mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model with LM heads mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# To replace in: "src/transformers/models/auto/modeling_flax_auto.py" if generating Flax
-# Below: "# Base model mapping"
-# Replace with:
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}Model"),
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForCausalLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Masked LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Sequence Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Question Answering mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering"),
-{% endif -%}
-# End.
-
-# Below: "# Model for Token Classification mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Multiple Choice mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice"),
-{% else -%}
-{% endif -%}
-# End.
-
-# Below: "# Model for Seq2Seq Causal LM mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else %}
-        ("{{cookiecutter.lowercase_modelname}}", "Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration"),
-{% endif -%}
-# End.
-
-
-
-# To replace in: "utils/check_repo.py" if generating PyTorch
-
-# Below: "models to ignore for model xxx mapping"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else -%}
-    "{{cookiecutter.camelcase_modelname}}Encoder",
-    "{{cookiecutter.camelcase_modelname}}Decoder",
-    "{{cookiecutter.camelcase_modelname}}DecoderWrapper",
-{% endif -%}
-# End.
-
-# Below: "models to ignore for not tested"
-# Replace with:
-{% if cookiecutter.is_encoder_decoder_model == "False" -%}
-{% else -%}
-    "{{cookiecutter.camelcase_modelname}}Encoder",  # Building part of bigger (tested) model.
-    "{{cookiecutter.camelcase_modelname}}Decoder",  # Building part of bigger (tested) model.
-    "{{cookiecutter.camelcase_modelname}}DecoderWrapper", # Building part of bigger (tested) model.
-{% endif -%}
-# End.
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for {{cookiecutter.modelname}}."""
-
-{%- if cookiecutter.tokenizer_type == "Based on BERT" %}
-from ...utils import logging
-from ..bert.tokenization_bert_fast import BertTokenizerFast
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
-    }
-}
-
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
-    r"""
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
-    [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BertTokenizerFast`] and runs
-    end-to-end tokenization: punctuation splitting and wordpiece.
-
-    This class defines no methods of its own; it only overrides the vocabulary file names, the pretrained
-    vocabulary URL map and the slow tokenizer class.
-
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
-    parameters.
-    """
-
-    # File names and download locations of the vocabulary for this model.
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    # Slow counterpart, imported from the sibling `tokenization_*` module.
-    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
-{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
-from ...utils import logging
-from ..bart.tokenization_bart_fast import BartTokenizerFast
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
-    r"""
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
-    [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BartTokenizerFast`] and runs
-    end-to-end tokenization: byte-level Byte-Pair-Encoding (vocabulary + merges files).
-
-    This class defines no methods of its own; it only overrides the vocabulary file names and the slow
-    tokenizer class.
-
-    Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
-    parameters.
-    """
-
-    # File names of the vocabulary, merges and serialized tokenizer for this model.
-    vocab_files_names = VOCAB_FILES_NAMES
-    # Slow counterpart, imported from the sibling `tokenization_*` module.
-    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
-{%- elif cookiecutter.tokenizer_type == "Standalone" %}
-from typing import List, Optional
-
-from tokenizers import ByteLevelBPETokenizer
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
-    The backend is a `ByteLevelBPETokenizer` built from the vocabulary and merges files.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file; forwarded to `ByteLevelBPETokenizer`.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning-of-sequence token.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end-of-sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Forwarded to `ByteLevelBPETokenizer` and stored on the instance.
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Forwarded to `ByteLevelBPETokenizer`.
-    """
-
-    # NOTE(review): `__init__` requires `merges_file`, but VOCAB_FILES_NAMES in this template has no
-    # "merges_file" entry -- confirm the merges file is resolved when loading a pretrained tokenizer.
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
-
-    def __init__(
-            self,
-            vocab_file,
-            merges_file,
-            unk_token="<|endoftext|>",
-            bos_token="<|endoftext|>",
-            eos_token="<|endoftext|>",
-            add_prefix_space=False,
-            trim_offsets=True,
-            **kwargs
-    ):
-        # The backend tokenizer is passed positionally; special tokens and any extra
-        # keyword arguments go to `PreTrainedTokenizerFast`.
-        super().__init__(
-            ByteLevelBPETokenizer(
-                vocab_file=vocab_file,
-                merges_file=merges_file,
-                add_prefix_space=add_prefix_space,
-                trim_offsets=trim_offsets,
-            ),
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            **kwargs,
-        )
-        self.add_prefix_space = add_prefix_space
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Wrap one or two sequences of token ids with the bos/eos token ids.
-
-        - single sequence: `bos X eos`
-        - pair of sequences: `bos A eos eos B eos`
-        """
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-
-    def create_token_type_ids_from_sequences(
-            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
-        {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
-        The returned list has one entry per token id plus one per special token (2 for a single sequence,
-        4 for a pair).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`:  List of zeros.
-        """
-        # NOTE(review): cls/sep ids are used here while `build_inputs_with_special_tokens` uses bos/eos;
-        # only the list lengths matter below, so the resulting length is the same.
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-
-{% endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
-# coding=utf-8
-# Copyright 2022 {{cookiecutter.authors}} and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for {{cookiecutter.modelname}}."""
-
-{%- if cookiecutter.tokenizer_type == "Based on BERT" %}
-from ...utils import logging
-from ..bert.tokenization_bert import BertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
-    }
-}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
-    r"""
-    Construct a {{cookiecutter.modelname}} tokenizer.
-
-    [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
-    tokenization: punctuation splitting and wordpiece.
-
-    This class defines no methods of its own; it only overrides the vocabulary file names and the
-    pretrained vocabulary URL map.
-
-    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
-    parameters.
-    """
-
-    # File names and download locations of the vocabulary for this model.
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-
-{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
-from ...utils import logging
-from ..bart.tokenization_bart import BartTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
-    """
-    Construct a {{cookiecutter.modelname}} tokenizer.
-
-    [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
-    tokenization: byte-level Byte-Pair-Encoding (vocabulary + merges files).
-
-    This class defines no methods of its own; it only overrides the vocabulary file names.
-
-    Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
-    parameters.
-    """
-
-    # File names of the vocabulary and merges files for this model.
-    vocab_files_names = VOCAB_FILES_NAMES
-
-{%- elif cookiecutter.tokenizer_type == "Standalone" %}
-from typing import List, Optional
-
-from tokenizers import ByteLevelBPETokenizer
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
-    """
-    Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-    This is a scaffold: the vocabulary-related methods (`vocab_size`, `get_vocab`, `_tokenize`,
-    `_convert_token_to_id`, `_convert_id_to_token`, `convert_tokens_to_string`, `save_vocabulary`) only carry
-    a docstring and must be implemented before the tokenizer can be used.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning-of-sequence token.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end-of-sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Default used by `prepare_for_tokenization` to decide whether a leading space is prepended to
-            the input text.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        add_prefix_space=False,
-        **kwargs
-    ):
-        # Plain strings are wrapped so that no whitespace is stripped around the special tokens.
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-
-        # `prepare_for_tokenization` reads this attribute as its default; without it every call
-        # raised AttributeError.
-        self.add_prefix_space = add_prefix_space
-
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        # TODO: initialisation (e.g. loading the vocabulary from `vocab_file`) goes here.
-
-    @property
-    def vocab_size(self):
-        """ Returns vocab size """
-
-    def get_vocab(self):
-        """ Returns vocab as a dict """
-
-    def _tokenize(self, text):
-        """ Returns a tokenized string. """
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None):
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
-            filename_prefix (`str`, *optional*):
-                Optional prefix for the names of the saved files. Accepted so that the base class can call
-                this method with a `filename_prefix` keyword when saving the tokenizer.
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A {{cookiecutter.modelname}} sequence has the following format (`<s>` and `</s>` stand for the
-        `cls` and `sep` tokens respectively):
-
-        - single sequence: `<s> X </s>`
-        - pair of sequences: `<s> A </s></s> B </s>`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            # The ids already contain special tokens: let the base class locate them.
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        # Mirrors the layout produced by `build_inputs_with_special_tokens`.
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
-        {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`:  List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        # Only the total length (tokens plus special tokens) matters; every entry is 0.
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        """
-        Prepend a space to `text` when requested, before tokenization.
-
-        A space is added when `is_split_into_words` is set or `add_prefix_space` (taken from `kwargs`, falling
-        back to the instance default) is truthy, and `text` is non-empty and does not already start with
-        whitespace.
-
-        Returns:
-            `Tuple[str, dict]`: The possibly modified text and the remaining keyword arguments.
-        """
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
-class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
-
-    The backend is a `ByteLevelBPETokenizer` built from the vocabulary and merges files.
-
-    NOTE(review): the fast tokenizer template (`tokenization_fast_*`) defines a class with this same name --
-    confirm which definition is meant to be kept.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file; forwarded to `ByteLevelBPETokenizer`.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning-of-sequence token.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end-of-sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Forwarded to `ByteLevelBPETokenizer` and stored on the instance.
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Forwarded to `ByteLevelBPETokenizer`.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-            self,
-            vocab_file,
-            merges_file,
-            unk_token="<|endoftext|>",
-            bos_token="<|endoftext|>",
-            eos_token="<|endoftext|>",
-            add_prefix_space=False,
-            trim_offsets=True,
-            **kwargs
-    ):
-        # The backend tokenizer is passed positionally; special tokens and any extra
-        # keyword arguments go to `PreTrainedTokenizerFast`.
-        super().__init__(
-            ByteLevelBPETokenizer(
-                vocab_file=vocab_file,
-                merges_file=merges_file,
-                add_prefix_space=add_prefix_space,
-                trim_offsets=trim_offsets,
-            ),
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            **kwargs,
-        )
-        self.add_prefix_space = add_prefix_space
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Wrap one or two sequences of token ids with the bos/eos token ids.
-
-        - single sequence: `bos X eos`
-        - pair of sequences: `bos A eos eos B eos`
-        """
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-
-    def create_token_type_ids_from_sequences(
-            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
-        {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
-
-        The returned list has one entry per token id plus one per special token (2 for a single sequence,
-        4 for a pair).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`:  List of zeros.
-        """
-        # NOTE(review): cls/sep ids are used here while `build_inputs_with_special_tokens` uses bos/eos;
-        # only the list lengths matter below, so the resulting length is the same.
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-{% endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.md
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.md
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# {{cookiecutter.modelname}}
-
-## Overview
-
-The {{cookiecutter.modelname}} model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>. <INSERT SHORT SUMMARY HERE>
-
-The abstract from the paper is the following:
-
-*<INSERT PAPER ABSTRACT HERE>*
-
-Tips:
-
-<INSERT TIPS ABOUT MODEL HERE>
-
-This model was contributed by [<INSERT YOUR HF USERNAME HERE>](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>). The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
-
-## {{cookiecutter.camelcase_modelname}}Config
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Config
-
-
-## {{cookiecutter.camelcase_modelname}}Tokenizer
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Tokenizer
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
-
-
-## {{cookiecutter.camelcase_modelname}}TokenizerFast
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}TokenizerFast
-
-
-{% if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-## {{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}Model
-    - forward
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## {{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForMaskedLM
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    - forward
-
-## {{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    - forward
-
-{%- else %}
-## {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    - forward
-
-
-## {{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] {{cookiecutter.camelcase_modelname}}ForCausalLM
-    - forward
-
-
-{% endif -%}
-{% endif -%}
-{% if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-## TF{{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}Model
-    - call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
-    - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-    - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    - call
-
-
-## TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    - call
-
-
-{%- else %}
-## TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    - call
-
-
-{% endif -%}
-{% endif -%}
-
-{% if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax -%}
-
-## Flax{{cookiecutter.camelcase_modelname}}Model
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}Model
-    - call
-
-{% if cookiecutter.is_encoder_decoder_model == "False" %}
-## Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMaskedLM
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForCausalLM
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForTokenClassification
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    - call
-
-
-{%- else %}
-## Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForSequenceClassification
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
-    - call
-
-
-## Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-
-[[autodoc]] Flax{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
-    - call
-
-
-{% endif -%}
-{% endif -%}
--- a/templates/adding_a_new_model/cookiecutter.json
+++ b/templates/adding_a_new_model/cookiecutter.json
-{
-  "modelname": "BrandNewBERT",
-  "uppercase_modelname": "BRAND_NEW_BERT",
-  "lowercase_modelname": "brand_new_bert",
-  "camelcase_modelname": "BrandNewBert",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"],
-  "generate_tensorflow_pytorch_and_flax": [
-    "PyTorch, TensorFlow and Flax",
-    "PyTorch & TensorFlow",
-    "PyTorch & Flax",
-    "TensorFlow & Flax",
-    "PyTorch",
-    "TensorFlow",
-    "Flax"
-  ],
-  "is_encoder_decoder_model": ["True", "False"]
-}
--- a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
-{
-  "modelname": "Template",
-  "uppercase_modelname": "TEMPLATE",
-  "lowercase_modelname": "template",
-  "camelcase_modelname": "Template",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_pytorch_and_flax": "PyTorch, TensorFlow and Flax",
-  "is_encoder_decoder_model": "False"
-}
--- a/templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json
-{
-  "modelname": "TemplateFLAX",
-  "uppercase_modelname": "TEMPLATE_FLAX",
-  "lowercase_modelname": "template_flax",
-  "camelcase_modelname": "TemplateFlax",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_pytorch_and_flax": "Flax",
-  "is_encoder_decoder_model": "False"
-}
--- a/templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json
+++ b/templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json
-{
-  "modelname": "FlaxNewENCDEC",
-  "uppercase_modelname": "FLAX_NEW_ENC_DEC",
-  "lowercase_modelname": "flax_new_enc_dec_template",
-  "camelcase_modelname": "FlaxNewEncDec",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "new-flax-enc-dec-base",
-  "tokenizer_type": "Based on BART",
-  "generate_tensorflow_pytorch_and_flax": "Flax",
-  "is_encoder_decoder_model": "True"
-}
--- a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
-{
-  "modelname": "TemplatePT",
-  "uppercase_modelname": "TEMPLATE_PT",
-  "lowercase_modelname": "template_pt",
-  "camelcase_modelname": "TemplatePt",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_pytorch_and_flax": "PyTorch",
-  "is_encoder_decoder_model": "False"
-}
--- a/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
+++ b/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
-{
-  "modelname": "PTNewENCDEC",
-  "uppercase_modelname": "PT_NEW_ENC_DEC",
-  "lowercase_modelname": "pt_new_enc_dec_template",
-  "camelcase_modelname": "PtNewEncDec",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "pt-new-enc-dec-base",
-  "tokenizer_type": "Based on BART",
-  "generate_tensorflow_pytorch_and_flax": "PyTorch",
-  "is_encoder_decoder_model": "True"
-}
--- a/templates/adding_a_new_model/tests/standalone.json
+++ b/templates/adding_a_new_model/tests/standalone.json
-{
-  "modelname": "TemplateBI",
-  "uppercase_modelname": "TEMPLATE_BI",
-  "lowercase_modelname": "template_bi",
-  "camelcase_modelname": "TemplateBi",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "bi-brand-new-bert-base-cased",
-  "tokenizer_type": "Standalone",
-  "generate_tensorflow_pytorch_and_flax": "PyTorch, TensorFlow and Flax",
-  "is_encoder_decoder_model": "False"
-}
--- a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
-{
-  "modelname": "TemplateTF",
-  "uppercase_modelname": "TEMPLATE_TF",
-  "lowercase_modelname": "template_tf",
-  "camelcase_modelname": "TemplateTf",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_pytorch_and_flax": "TensorFlow",
-  "is_encoder_decoder_model": "False"
-}
--- a/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
+++ b/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
-{
-  "modelname": "NewTFENCDEC",
-  "uppercase_modelname": "NEW_TF_ENC_DEC",
-  "lowercase_modelname": "new_tf_enc_dec_template",
-  "camelcase_modelname": "NewTFEncDec",
-  "authors": "The HuggingFace Team",
-  "checkpoint_identifier": "new-tf-enc-dec-base_template",
-  "tokenizer_type": "Based on BART",
-  "generate_tensorflow_pytorch_and_flax": "TensorFlow",
-  "is_encoder_decoder_model": "True"
-}
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -335,7 +335,6 @@ src/transformers/benchmark/benchmark_args_tf.py
 src/transformers/benchmark/benchmark_args_utils.py
 src/transformers/benchmark/benchmark_tf.py
 src/transformers/benchmark/benchmark_utils.py
-src/transformers/commands/add_new_model.py
 src/transformers/commands/add_new_model_like.py
 src/transformers/commands/convert.py
 src/transformers/commands/download.py