Unverified Commit 566b083e authored by Sam Shleifer, committed by GitHub

TFMarian, TFMbart, TFPegasus, TFBlenderbot (#7987)



* Start plumbing

* Marian close

* Small stubs for all children

* Fixed bart

* marian working

* pegasus test is good, but failing

* Checkin tests

* More model files

* Subtle marian, pegasus integration test failures

* Works well

* rm print

* boom boom

* Still failing model2doc

* merge master

* Equivalence test failing, all others fixed

* cleanup

* Fix embed_scale

* Cleanup marian pipeline test

* Undo extra changes

* Smaller delta

* Cleanup model testers

* undo delta

* fix tests import structure

* cross test decorator

* Cleaner set_weights

* Respect authorized_unexpected_keys

* No warnings

* No warnings

* style

* Nest tf import

* black

* Apply suggestions from code review
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* functional dropout

* fixup

* Fixup

* style_doc

* embs

* shape list

* delete slow force_token_id_to_be_generated func

* fixup
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent 6279072f
...@@ -95,3 +95,12 @@ See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` ...@@ -95,3 +95,12 @@ See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward`
.. autoclass:: transformers.BlenderbotForConditionalGeneration .. autoclass:: transformers.BlenderbotForConditionalGeneration
:members: :members:
TFBlenderbotForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See :obj:`transformers.TFBartForConditionalGeneration` for arguments to `forward` and `generate`
.. autoclass:: transformers.TFBlenderbotForConditionalGeneration
:members:
...@@ -129,3 +129,9 @@ MarianMTModel ...@@ -129,3 +129,9 @@ MarianMTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MarianMTModel .. autoclass:: transformers.MarianMTModel
TFMarianMTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFMarianMTModel
...@@ -79,4 +79,11 @@ MBartForConditionalGeneration ...@@ -79,4 +79,11 @@ MBartForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MBartForConditionalGeneration .. autoclass:: transformers.MBartForConditionalGeneration
:members: forward :members:
TFMBartForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFMBartForConditionalGeneration
:members:
...@@ -95,3 +95,9 @@ PegasusForConditionalGeneration ...@@ -95,3 +95,9 @@ PegasusForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PegasusForConditionalGeneration .. autoclass:: transformers.PegasusForConditionalGeneration
TFPegasusForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFPegasusForConditionalGeneration
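
The TF heads documented above mirror their PyTorch counterparts, so translation with the new TFMarianMTModel follows the familiar tokenize/generate/decode loop. A minimal sketch, assuming the `Helsinki-NLP/opus-mt-en-de` checkpoint name and that only PyTorch weights exist for it (hence `from_pt=True`):

    from transformers import MarianTokenizer, TFMarianMTModel

    model_name = "Helsinki-NLP/opus-mt-en-de"  # example checkpoint, swap in any Marian pair
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = TFMarianMTModel.from_pretrained(model_name, from_pt=True)

    batch = tokenizer(["I love TensorFlow."], return_tensors="tf")
    generated_ids = model.generate(batch.input_ids, attention_mask=batch.attention_mask, num_beams=2)
    print(tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True))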
...@@ -670,6 +670,7 @@ if is_tf_available(): ...@@ -670,6 +670,7 @@ if is_tf_available():
TFBertModel, TFBertModel,
TFBertPreTrainedModel, TFBertPreTrainedModel,
) )
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
from .modeling_tf_camembert import ( from .modeling_tf_camembert import (
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCamembertForMaskedLM, TFCamembertForMaskedLM,
...@@ -750,6 +751,8 @@ if is_tf_available(): ...@@ -750,6 +751,8 @@ if is_tf_available():
TFLxmertPreTrainedModel, TFLxmertPreTrainedModel,
TFLxmertVisualFeatureEncoder, TFLxmertVisualFeatureEncoder,
) )
from .modeling_tf_marian import TFMarianMTModel
from .modeling_tf_mbart import TFMBartForConditionalGeneration
from .modeling_tf_mobilebert import ( from .modeling_tf_mobilebert import (
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFMobileBertForMaskedLM, TFMobileBertForMaskedLM,
...@@ -771,6 +774,7 @@ if is_tf_available(): ...@@ -771,6 +774,7 @@ if is_tf_available():
TFOpenAIGPTModel, TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel, TFOpenAIGPTPreTrainedModel,
) )
from .modeling_tf_pegasus import TFPegasusForConditionalGeneration
from .modeling_tf_roberta import ( from .modeling_tf_roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaForMaskedLM, TFRobertaForMaskedLM,
......
...@@ -427,7 +427,6 @@ class DecoderLayer(nn.Module): ...@@ -427,7 +427,6 @@ class DecoderLayer(nn.Module):
output_attentions=False, output_attentions=False,
): ):
residual = x residual = x
if layer_state is None: if layer_state is None:
layer_state = {} layer_state = {}
if self.normalize_before: if self.normalize_before:
...@@ -447,7 +446,7 @@ class DecoderLayer(nn.Module): ...@@ -447,7 +446,7 @@ class DecoderLayer(nn.Module):
if not self.normalize_before: if not self.normalize_before:
x = self.self_attn_layer_norm(x) x = self.self_attn_layer_norm(x)
# Cross attention # Cross-Attention Block
residual = x residual = x
assert self.encoder_attn.cache_key != self.self_attn.cache_key assert self.encoder_attn.cache_key != self.self_attn.cache_key
if self.normalize_before: if self.normalize_before:
...@@ -628,7 +627,6 @@ class BartDecoder(nn.Module): ...@@ -628,7 +627,6 @@ class BartDecoder(nn.Module):
encoder_hidden_states = encoder_hidden_states.transpose(0, 1) encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
next_cache = next_decoder_cache if use_cache else None next_cache = next_decoder_cache if use_cache else None
if not return_dict: if not return_dict:
return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None) return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast( return BaseModelOutputWithPast(
......
...@@ -41,6 +41,10 @@ from .configuration_auto import ( ...@@ -41,6 +41,10 @@ from .configuration_auto import (
XLNetConfig, XLNetConfig,
replace_list_option_in_docstrings, replace_list_option_in_docstrings,
) )
from .configuration_blenderbot import BlenderbotConfig
from .configuration_marian import MarianConfig
from .configuration_mbart import MBartConfig
from .configuration_pegasus import PegasusConfig
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .modeling_tf_albert import ( from .modeling_tf_albert import (
...@@ -63,6 +67,7 @@ from .modeling_tf_bert import ( ...@@ -63,6 +67,7 @@ from .modeling_tf_bert import (
TFBertLMHeadModel, TFBertLMHeadModel,
TFBertModel, TFBertModel,
) )
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
from .modeling_tf_camembert import ( from .modeling_tf_camembert import (
TFCamembertForMaskedLM, TFCamembertForMaskedLM,
TFCamembertForMultipleChoice, TFCamembertForMultipleChoice,
...@@ -108,6 +113,8 @@ from .modeling_tf_funnel import ( ...@@ -108,6 +113,8 @@ from .modeling_tf_funnel import (
) )
from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model
from .modeling_tf_longformer import TFLongformerForMaskedLM, TFLongformerForQuestionAnswering, TFLongformerModel from .modeling_tf_longformer import TFLongformerForMaskedLM, TFLongformerForQuestionAnswering, TFLongformerModel
from .modeling_tf_marian import TFMarianMTModel
from .modeling_tf_mbart import TFMBartForConditionalGeneration
from .modeling_tf_mobilebert import ( from .modeling_tf_mobilebert import (
TFMobileBertForMaskedLM, TFMobileBertForMaskedLM,
TFMobileBertForMultipleChoice, TFMobileBertForMultipleChoice,
...@@ -118,6 +125,7 @@ from .modeling_tf_mobilebert import ( ...@@ -118,6 +125,7 @@ from .modeling_tf_mobilebert import (
TFMobileBertModel, TFMobileBertModel,
) )
from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
from .modeling_tf_pegasus import TFPegasusForConditionalGeneration
from .modeling_tf_roberta import ( from .modeling_tf_roberta import (
TFRobertaForMaskedLM, TFRobertaForMaskedLM,
TFRobertaForMultipleChoice, TFRobertaForMultipleChoice,
...@@ -210,6 +218,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( ...@@ -210,6 +218,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
(T5Config, TFT5ForConditionalGeneration), (T5Config, TFT5ForConditionalGeneration),
(DistilBertConfig, TFDistilBertForMaskedLM), (DistilBertConfig, TFDistilBertForMaskedLM),
(AlbertConfig, TFAlbertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM),
(MarianConfig, TFMarianMTModel),
(BartConfig, TFBartForConditionalGeneration), (BartConfig, TFBartForConditionalGeneration),
(CamembertConfig, TFCamembertForMaskedLM), (CamembertConfig, TFCamembertForMaskedLM),
(XLMRobertaConfig, TFXLMRobertaForMaskedLM), (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
...@@ -261,8 +270,16 @@ TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( ...@@ -261,8 +270,16 @@ TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
] ]
) )
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
[(T5Config, TFT5ForConditionalGeneration), (BartConfig, TFBartForConditionalGeneration)] [
(T5Config, TFT5ForConditionalGeneration),
(MarianConfig, TFMarianMTModel),
(MBartConfig, TFMBartForConditionalGeneration),
(PegasusConfig, TFPegasusForConditionalGeneration),
(BlenderbotConfig, TFBlenderbotForConditionalGeneration),
(BartConfig, TFBartForConditionalGeneration),
]
) )
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
......
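
The ordering of TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING matters: the new configs subclass BartConfig, and the auto classes return the first entry whose config class matches, so the BartConfig fallback has to stay last. A simplified dispatch sketch, assuming an isinstance-based lookup like the auto classes use; the stand-in classes and the pick_model_class helper are hypothetical, for illustration only:

    from collections import OrderedDict

    # Hypothetical stand-ins for the real config/model classes.
    class BartConfig: ...
    class MarianConfig(BartConfig): ...
    class TFBartForConditionalGeneration: ...
    class TFMarianMTModel: ...

    TF_SEQ2SEQ_MAPPING = OrderedDict(
        [
            (MarianConfig, TFMarianMTModel),  # subclasses of BartConfig come first,
            (BartConfig, TFBartForConditionalGeneration),  # the BartConfig fallback stays last
        ]
    )

    def pick_model_class(config):
        # First config class that matches wins, so a MarianConfig is not swallowed by the Bart entry.
        for config_cls, model_cls in TF_SEQ2SEQ_MAPPING.items():
            if isinstance(config, config_cls):
                return model_cls
        raise ValueError(f"Unrecognized configuration class {config.__class__}")

    assert pick_model_class(MarianConfig()) is TFMarianMTModel
    assert pick_model_class(BartConfig()) is TFBartForConditionalGeneration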
...@@ -19,9 +19,10 @@ import random ...@@ -19,9 +19,10 @@ import random
import warnings import warnings
from typing import Dict, Optional, Tuple from typing import Dict, Optional, Tuple
import numpy as np
import tensorflow as tf import tensorflow as tf
from tensorflow import Tensor from tensorflow import Tensor
from tensorflow.keras.layers import Dense, LayerNormalization from tensorflow.keras.layers import Dense, Layer, LayerNormalization
from .activations_tf import ACT2FN from .activations_tf import ACT2FN
from .configuration_bart import BartConfig from .configuration_bart import BartConfig
...@@ -43,7 +44,6 @@ from .utils import logging ...@@ -43,7 +44,6 @@ from .utils import logging
_CONFIG_FOR_DOC = "BartConfig" _CONFIG_FOR_DOC = "BartConfig"
_TOKENIZER_FOR_DOC = "BartTokenizer"
BART_START_DOCSTRING = r""" BART_START_DOCSTRING = r"""
...@@ -218,22 +218,21 @@ PAST_KV_DEPRECATION_WARNING = ( ...@@ -218,22 +218,21 @@ PAST_KV_DEPRECATION_WARNING = (
) )
class TFEncoderLayer(tf.keras.layers.Layer): class TFEncoderLayer(Layer):
def __init__(self, config: BartConfig, **kwargs): def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFAttention( self.self_attn = TFAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.normalize_before = config.normalize_before
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout_wt = tf.keras.layers.Dropout(config.dropout) self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function] self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = config.activation_dropout
self.fc1 = Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = Dense(self.embed_dim, name="fc2") self.fc2 = Dense(self.embed_dim, name="fc2")
self.final_layer_norm = LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.normalize_before = config.normalize_before
def call(self, x, encoder_padding_mask, training=False): def call(self, x, encoder_padding_mask, training=False):
""" """
...@@ -251,8 +250,10 @@ class TFEncoderLayer(tf.keras.layers.Layer): ...@@ -251,8 +250,10 @@ class TFEncoderLayer(tf.keras.layers.Layer):
if self.normalize_before: if self.normalize_before:
x = self.self_attn_layer_norm(x) x = self.self_attn_layer_norm(x)
x, self_attn_weights = self.self_attn(query=x, key=x, key_padding_mask=encoder_padding_mask) x, self_attn_weights = self.self_attn(query=x, key=x, key_padding_mask=encoder_padding_mask)
assert x.shape == residual.shape, f"Self attn modified the shape of query {residual.shape} to {x.shape}" assert shape_list(x) == shape_list(
x = self.dropout_wt(x, training=training) residual
), f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(x)}"
x = tf.nn.dropout(x, rate=self.dropout if training else 0)
x = residual + x x = residual + x
if not self.normalize_before: if not self.normalize_before:
x = self.self_attn_layer_norm(x) x = self.self_attn_layer_norm(x)
...@@ -261,9 +262,9 @@ class TFEncoderLayer(tf.keras.layers.Layer): ...@@ -261,9 +262,9 @@ class TFEncoderLayer(tf.keras.layers.Layer):
if self.normalize_before: if self.normalize_before:
x = self.final_layer_norm(x) x = self.final_layer_norm(x)
x = self.activation_fn(self.fc1(x)) x = self.activation_fn(self.fc1(x))
x = self.activation_dropout(x, training=training) x = tf.nn.dropout(x, rate=self.activation_dropout if training else 0)
x = self.fc2(x) x = self.fc2(x)
x = self.dropout_wt(x, training=training) x = tf.nn.dropout(x, rate=self.dropout if training else 0)
x = residual + x x = residual + x
if not self.normalize_before: if not self.normalize_before:
x = self.final_layer_norm(x) x = self.final_layer_norm(x)
...@@ -271,7 +272,7 @@ class TFEncoderLayer(tf.keras.layers.Layer): ...@@ -271,7 +272,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
return x, self_attn_weights return x, self_attn_weights
class TFBartEncoder(tf.keras.layers.Layer): class TFBartEncoder(Layer):
# config_class = BartConfig # config_class = BartConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -289,26 +290,30 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -289,26 +290,30 @@ class TFBartEncoder(tf.keras.layers.Layer):
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
embed_dim = embed_tokens.vocab_size self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
self.embed_tokens = embed_tokens self.embed_tokens = embed_tokens
self.embed_positions = TFLearnedPositionalEmbedding( if config.static_position_embeddings:
config.max_position_embeddings, self.embed_positions = TFSinusoidalPositionalEmbedding(
embed_tokens.hidden_size, config.max_position_embeddings,
self.padding_idx, config.d_model,
config.extra_pos_embeddings, name="embed_positions",
name="embed_positions", )
) else:
self.embed_positions = TFLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
config.extra_pos_embeddings,
name="embed_positions",
)
self.layers = [TFEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = (
self.layer_norm = ( LayerNormalization(epsilon=1e-5, name="layernorm_embedding") if config.normalize_embedding else Layer()
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
if config.add_final_layer_norm
else None
) )
self.layer_norm = LayerNormalization(epsilon=1e-5, name="layer_norm") if config.add_final_layer_norm else None
self.return_dict = config.return_dict self.return_dict = config.return_dict
def call( def call(
...@@ -347,7 +352,7 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -347,7 +352,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
), f"expected attention_mask._rank() to be a 2D tensor got {attention_mask._rank()}" ), f"expected attention_mask._rank() to be a 2D tensor got {attention_mask._rank()}"
attention_mask = tf.cast(attention_mask, dtype=tf.float32) attention_mask = tf.cast(attention_mask, dtype=tf.float32)
attention_mask = (1.0 - attention_mask) * LARGE_NEGATIVE attention_mask = (1.0 - attention_mask) * LARGE_NEGATIVE
inputs_embeds = self.embed_tokens(input_ids) inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_ids) embed_pos = self.embed_positions(input_ids)
x = inputs_embeds + embed_pos x = inputs_embeds + embed_pos
x = self.layernorm_embedding(x) x = self.layernorm_embedding(x)
...@@ -384,7 +389,7 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -384,7 +389,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
return TFBaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions) return TFBaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
class TFDecoderLayer(tf.keras.layers.Layer): class TFDecoderLayer(Layer):
def __init__(self, config: BartConfig, **kwargs): def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -397,8 +402,9 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -397,8 +402,9 @@ class TFDecoderLayer(tf.keras.layers.Layer):
self.dropout = config.dropout self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function] self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout self.activation_dropout = config.activation_dropout
self.normalize_before = config.normalize_before
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFAttention( self.encoder_attn = TFAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -406,10 +412,10 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -406,10 +412,10 @@ class TFDecoderLayer(tf.keras.layers.Layer):
encoder_decoder_attention=True, encoder_decoder_attention=True,
name="encoder_attn", name="encoder_attn",
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = Dense(self.embed_dim, name="fc2") self.fc2 = Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = LayerNormalization(epsilon=1e-5, name="final_layer_norm")
def call( def call(
self, self,
...@@ -433,10 +439,12 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -433,10 +439,12 @@ class TFDecoderLayer(tf.keras.layers.Layer):
Tuple containing, encoded output of shape `(seq_len, batch, embed_dim)`, self_attn_weights, layer_state Tuple containing, encoded output of shape `(seq_len, batch, embed_dim)`, self_attn_weights, layer_state
""" """
residual = x # Make a copy of the input tensor to add later.
if layer_state is None: if layer_state is None:
layer_state = {} layer_state = {}
if self.normalize_before:
x = self.self_attn_layer_norm(x)
residual = x # Make a copy of the input tensor to add later.
# next line mutates layer state and we need a copy of it # next line mutates layer state and we need a copy of it
x, self_attn_weights = self.self_attn( x, self_attn_weights = self.self_attn(
query=x, query=x,
...@@ -447,9 +455,12 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -447,9 +455,12 @@ class TFDecoderLayer(tf.keras.layers.Layer):
) )
x = tf.nn.dropout(x, rate=self.dropout if training else 0) x = tf.nn.dropout(x, rate=self.dropout if training else 0)
x = residual + x x = residual + x
x = self.self_attn_layer_norm(x) if not self.normalize_before:
x = self.self_attn_layer_norm(x)
# Cross-Attention Block
residual = x residual = x
# Cross-Attention if self.normalize_before:
x = self.encoder_attn_layer_norm(x)
x, _ = self.encoder_attn( x, _ = self.encoder_attn(
query=x, query=x,
key=encoder_hidden_states, key=encoder_hidden_states,
...@@ -458,16 +469,19 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -458,16 +469,19 @@ class TFDecoderLayer(tf.keras.layers.Layer):
) )
x = tf.nn.dropout(x, rate=self.dropout if training else 0) x = tf.nn.dropout(x, rate=self.dropout if training else 0)
x = residual + x x = residual + x
if not self.normalize_before:
x = self.encoder_attn_layer_norm(x) x = self.encoder_attn_layer_norm(x)
# Fully Connected
residual = x residual = x
if self.normalize_before:
x = self.final_layer_norm(x)
x = self.activation_fn(self.fc1(x)) x = self.activation_fn(self.fc1(x))
x = tf.nn.dropout(x, rate=self.activation_dropout if training else 0) x = tf.nn.dropout(x, rate=self.activation_dropout if training else 0)
x = self.fc2(x) x = self.fc2(x)
x = tf.nn.dropout(x, rate=self.dropout if training else 0) x = tf.nn.dropout(x, rate=self.dropout if training else 0)
x = residual + x x = residual + x
x = self.final_layer_norm(x) if not self.normalize_before:
x = self.final_layer_norm(x)
return ( return (
x, x,
self_attn_weights, self_attn_weights,
...@@ -475,7 +489,7 @@ class TFDecoderLayer(tf.keras.layers.Layer): ...@@ -475,7 +489,7 @@ class TFDecoderLayer(tf.keras.layers.Layer):
) # just self_attn weights for now, following t5, layer_state = cache for decoding ) # just self_attn weights for now, following t5, layer_state = cache for decoding
class TFBartDecoder(tf.keras.layers.Layer): class TFBartDecoder(Layer):
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFDecoderLayer` Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TFDecoderLayer`
...@@ -491,26 +505,27 @@ class TFBartDecoder(tf.keras.layers.Layer): ...@@ -491,26 +505,27 @@ class TFBartDecoder(tf.keras.layers.Layer):
self.max_target_positions = config.max_position_embeddings self.max_target_positions = config.max_position_embeddings
self.embed_tokens = embed_tokens self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_positions = TFLearnedPositionalEmbedding( if config.static_position_embeddings:
config.max_position_embeddings, self.embed_positions = TFSinusoidalPositionalEmbedding(
config.d_model, config.max_position_embeddings,
self.padding_idx, config.d_model,
config.extra_pos_embeddings, name="embed_positions",
name="embed_positions", )
) else:
self.embed_positions = TFLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
config.extra_pos_embeddings,
name="embed_positions",
)
self.layers = [TFDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = ( self.layernorm_embedding = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") LayerNormalization(epsilon=1e-5, name="layernorm_embedding") if config.normalize_embedding else Layer()
if config.normalize_embedding
else tf.identity
)
self.layer_norm = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
if config.add_final_layer_norm
else None
) )
self.layer_norm = LayerNormalization(epsilon=1e-5, name="layer_norm") if config.add_final_layer_norm else None
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = config.dropout
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.use_cache = config.use_cache self.use_cache = config.use_cache
...@@ -553,11 +568,11 @@ class TFBartDecoder(tf.keras.layers.Layer): ...@@ -553,11 +568,11 @@ class TFBartDecoder(tf.keras.layers.Layer):
x = self.layernorm_embedding(x) + positions x = self.layernorm_embedding(x) + positions
else: else:
x = self.layernorm_embedding(x + positions) x = self.layernorm_embedding(x + positions)
x = self.dropout(x) x = tf.nn.dropout(x, rate=self.dropout if training else 0)
# Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
x = tf.transpose(x, perm=(1, 0, 2)) x = tf.transpose(x, perm=(1, 0, 2))
assert len(encoder_hidden_states.shape) == 3, "encoder_hidden_states must be a 3D tensor" assert len(shape_list(encoder_hidden_states)) == 3, "encoder_hidden_states must be a 3D tensor"
encoder_hidden_states = tf.transpose(encoder_hidden_states, perm=(1, 0, 2)) encoder_hidden_states = tf.transpose(encoder_hidden_states, perm=(1, 0, 2))
# decoder layers # decoder layers
...@@ -623,7 +638,7 @@ def _reorder_buffer(attn_cache, new_order): ...@@ -623,7 +638,7 @@ def _reorder_buffer(attn_cache, new_order):
return attn_cache return attn_cache
class TFAttention(tf.keras.layers.Layer): class TFAttention(Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -678,8 +693,10 @@ class TFAttention(tf.keras.layers.Layer): ...@@ -678,8 +693,10 @@ class TFAttention(tf.keras.layers.Layer):
(default: None). (default: None).
""" """
static_kv = self.encoder_decoder_attention # value=key=encoder_hidden_states, static_kv = self.encoder_decoder_attention # value=key=encoder_hidden_states,
tgt_len, bsz, embed_dim = query.shape tgt_len, bsz, embed_dim = shape_list(query)
assert embed_dim == self.embed_dim, f"query must be shaped {(tgt_len, bsz, self.embed_dim)} got {query.shape}" assert (
embed_dim == self.embed_dim
), f"query must be shaped {(tgt_len, bsz, self.embed_dim)} got {shape_list(query)}"
# get here for encoder decoder cause of static_kv # get here for encoder decoder cause of static_kv
if layer_state is not None: # get the last k and v for reuse if layer_state is not None: # get the last k and v for reuse
saved_state = layer_state.get(self.cache_key, {}) saved_state = layer_state.get(self.cache_key, {})
...@@ -718,7 +735,7 @@ class TFAttention(tf.keras.layers.Layer): ...@@ -718,7 +735,7 @@ class TFAttention(tf.keras.layers.Layer):
) )
# Compute multi-headed attention # Compute multi-headed attention
src_len = k.shape[1] src_len = shape_list(k)[1]
attn_weights = tf.matmul(q, k, transpose_b=True) # shape (bsz * self.num_heads, tgt_len, src_len) attn_weights = tf.matmul(q, k, transpose_b=True) # shape (bsz * self.num_heads, tgt_len, src_len)
if attn_mask is not None: if attn_mask is not None:
...@@ -770,7 +787,7 @@ class TFLearnedPositionalEmbedding(TFSharedEmbeddings): ...@@ -770,7 +787,7 @@ class TFLearnedPositionalEmbedding(TFSharedEmbeddings):
def call(self, input_ids: tf.Tensor, use_cache=False): def call(self, input_ids: tf.Tensor, use_cache=False):
"""Input is expected to be of size [bsz x seqlen].""" """Input is expected to be of size [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2] bsz, seq_len = shape_list(input_ids)[:2]
if use_cache: if use_cache:
positions = tf.fill((1, 1), seq_len - 1) positions = tf.fill((1, 1), seq_len - 1)
...@@ -780,6 +797,56 @@ class TFLearnedPositionalEmbedding(TFSharedEmbeddings): ...@@ -780,6 +797,56 @@ class TFLearnedPositionalEmbedding(TFSharedEmbeddings):
return super().call(positions + self.offset) # super object is not callable for some reason return super().call(positions + self.offset) # super object is not callable for some reason
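
The `.shape` to `shape_list(...)` swaps throughout this file matter once the layers run inside a Keras graph or `tf.function`, where some static dimensions are `None`. A simplified sketch of the idea behind the helper; the real implementation lives in modeling_tf_utils.py, and this stand-in is for illustration only:

    import tensorflow as tf

    def shape_list(t: tf.Tensor):
        # Use the static dimension when it is known, otherwise fall back to the dynamic one.
        static = t.shape.as_list()
        dynamic = tf.shape(t)
        return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None, 512], dtype=tf.float32)])
    def flatten(t):
        bsz, seq_len, dim = shape_list(t)  # plain t.shape would yield None for bsz and seq_len here
        return tf.reshape(t, (bsz * seq_len, dim))

    print(flatten(tf.zeros((2, 7, 512))).shape)  # (14, 512)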
class TFSinusoidalPositionalEmbedding(tf.keras.layers.Embedding):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions, embedding_dim, **kwargs):
if embedding_dim % 2 != 0:
raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
super().__init__(
num_positions,
embedding_dim,
**kwargs,
)
def build(self, input_shape):
"""
Build shared token embedding layer. Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
super().build(input_shape) # Instantiates self.weight so it can be loaded
weight: np.ndarray = self._init_weight(self.input_dim, self.output_dim)
self.set_weights([weight]) # overwrite self.weight to correct value
@staticmethod
def _init_weight(n_pos, dim):
"""
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector. [dim // 2:]
"""
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
# write into a fresh array: assigning back into position_enc would overwrite the odd
# columns below dim // 2 before the cos half reads them
table = np.zeros_like(position_enc)
# index 0 is all zero in the sin half
table[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2])
table[:, dim // 2 :] = np.cos(position_enc[:, 1::2])
# convert to tensor
table = tf.convert_to_tensor(table, dtype=tf.float32)
tf.stop_gradient(table)
return table
def call(self, input_ids, use_cache=False):
"""Input is expected to be of size [bsz x seqlen]."""
bsz, seq_len = shape_list(input_ids)[:2]
if use_cache:
positions = tf.fill((1, 1), seq_len - 1)
else:
# starts at 0, ends at seq_len - 1
positions = tf.range(0, seq_len, delta=1, dtype=tf.int32, name="range")
return super().call(positions)
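
A standalone numpy sketch of the layout the docstring above describes, with sin features in the first half of each row and cos features in the second; the tiny n_pos/dim values are arbitrary, for illustration:

    import numpy as np

    n_pos, dim = 4, 6
    angles = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    table = np.zeros_like(angles)
    table[:, : dim // 2] = np.sin(angles[:, 0::2])  # sin features occupy columns [: dim // 2]
    table[:, dim // 2 :] = np.cos(angles[:, 1::2])  # cos features occupy columns [dim // 2 :]
    print(np.round(table[1], 4))  # position 1 starts with sin(1) ~= 0.8415, the first reference value in the tests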
# Public API # Public API
...@@ -818,7 +885,7 @@ class TFBartModel(TFPretrainedBartModel): ...@@ -818,7 +885,7 @@ class TFBartModel(TFPretrainedBartModel):
pad_token_id = self.config.pad_token_id pad_token_id = self.config.pad_token_id
if decoder_input_ids is None: if decoder_input_ids is None:
decoder_input_ids = self._shift_right(inputs) decoder_input_ids = self._shift_right(inputs)
bsz, tgt_len = decoder_input_ids.shape[:2] bsz, tgt_len = shape_list(decoder_input_ids)[:2]
if decoder_attn_mask is None: if decoder_attn_mask is None:
decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id) decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
else: else:
...@@ -950,16 +1017,20 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel): ...@@ -950,16 +1017,20 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel):
base_model_prefix = "model" base_model_prefix = "model"
authorized_missing_keys = [ authorized_missing_keys = [
r"final_logits_bias", r"final_logits_bias",
r"encoder\.version", ]
r"decoder\.version", authorized_unexpected_keys = [
"model.encoder.embed_tokens.weight", r"model.encoder.embed_tokens.weight",
"model.decoder.embed_tokens.weight", r"model.decoder.embed_tokens.weight",
] ]
def __init__(self, config: BartConfig, *args, **kwargs): def __init__(self, config: BartConfig, *args, **kwargs):
super().__init__(config, *args, **kwargs) super().__init__(config, *args, **kwargs)
self.model = TFBartModel(config, name="model") self.model = TFBartModel(config, name="model")
self.use_cache = config.use_cache self.use_cache = config.use_cache
# final_logits_bias is registered as a buffer in pytorch, so it is not trainable here for the sake of consistency.
self.final_logits_bias = self.add_weight(
name="/final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
)
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
...@@ -1050,6 +1121,7 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel): ...@@ -1050,6 +1121,7 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel):
return_dict=True, # TODO(SS): this may need to change to support compilation return_dict=True, # TODO(SS): this may need to change to support compilation
) )
logits = self.model.shared(outputs.last_hidden_state, mode="linear") logits = self.model.shared(outputs.last_hidden_state, mode="linear")
logits = logits + self.final_logits_bias
loss = None if labels is None else self.compute_loss(labels, logits) loss = None if labels is None else self.compute_loss(labels, logits)
past = outputs.past_key_values if cast_bool_to_primitive(use_cache, self.config.use_cache) else None past = outputs.past_key_values if cast_bool_to_primitive(use_cache, self.config.use_cache) else None
...@@ -1096,7 +1168,7 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel): ...@@ -1096,7 +1168,7 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel):
), f"decoder cached states must be truthy. got {decoder_cached_states} from the 2nd element of past" ), f"decoder cached states must be truthy. got {decoder_cached_states} from the 2nd element of past"
assert isinstance( assert isinstance(
encoder_outputs, TFBaseModelOutput encoder_outputs, TFBaseModelOutput
), "encoder_outputs should be a TFBaseModelOutput, Instead got " ), f"encoder_outputs should be a TFBaseModelOutput, Instead got {type(encoder_outputs)}."
return { return {
"inputs": None, # encoder_outputs is defined. input_ids not needed "inputs": None, # encoder_outputs is defined. input_ids not needed
"encoder_outputs": encoder_outputs, "encoder_outputs": encoder_outputs,
...@@ -1113,7 +1185,6 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel): ...@@ -1113,7 +1185,6 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel):
reordered_past = [] reordered_past = []
for layer_past in decoder_cached_states: for layer_past in decoder_cached_states:
# get the correct batch idx from decoder layer's batch dim for cross and self-attn # get the correct batch idx from decoder layer's batch dim for cross and self-attn
layer_past_new = { layer_past_new = {
attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
} }
...@@ -1124,26 +1195,13 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel): ...@@ -1124,26 +1195,13 @@ class TFBartForConditionalGeneration(TFPretrainedBartModel):
def adjust_logits_during_generation(self, logits, cur_len, max_length): def adjust_logits_during_generation(self, logits, cur_len, max_length):
if cur_len == 1 and self.config.force_bos_token_to_be_generated: if cur_len == 1 and self.config.force_bos_token_to_be_generated:
logits = self._force_token_id_to_be_generated(logits, self.config.bos_token_id) vocab_range = tf.constant(range(self.config.vocab_size))
elif cur_len == max_length - 1 and self.config.eos_token_id is not None: return tf.where(vocab_range != self.config.bos_token_id, LARGE_NEGATIVE, logits)
logits = self._force_token_id_to_be_generated(logits, self.config.eos_token_id) elif cur_len == max_length - 1:
return logits vocab_range = tf.constant(range(self.config.vocab_size))
return tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits)
@staticmethod else:
def _force_token_id_to_be_generated(scores, token_id) -> None: return logits
"""force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))"""
output_list = []
# Is there a better way to do scores[:, [x for if x != token_id]] = -float("inf") in TF?
bs, vocab_size = scores.shape
for x in range(vocab_size):
if x != token_id:
output_list.append(tf.convert_to_tensor([-float("inf")] * bs, dtype=scores.dtype))
else:
output_list.append(scores[:, x])
scores = tf.stack(output_list, axis=1, name="scores")
assert scores.shape == (bs, vocab_size)
return scores
def get_output_embeddings(self): def get_output_embeddings(self):
return self.model.shared return self.model.shared
......
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF BlenderBot model, ported from the fairseq repo."""
from .configuration_blenderbot import BlenderbotConfig
from .file_utils import add_start_docstrings, is_tf_available
from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration
from .utils import logging
if is_tf_available():
import tensorflow as tf
_CONFIG_FOR_DOC = "BlenderbotConfig"
START_DOCSTRING = BART_START_DOCSTRING.replace(
"inherits from :class:`~transformers.TFPreTrainedModel`",
"inherits from :class:`~transformers.TFBartForConditionalGeneration`",
).replace("BartConfig", _CONFIG_FOR_DOC)
logger = logging.get_logger(__name__)
@add_start_docstrings("Blenderbot model for open domain dialogue", START_DOCSTRING)
class TFBlenderbotForConditionalGeneration(TFBartForConditionalGeneration):
config_class = BlenderbotConfig
def adjust_logits_during_generation(self, logits, cur_len, max_length):
"""Never predict pad_token_id. Predict </s> when max_length is reached."""
vocab_range = tf.constant(range(self.config.vocab_size))
logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits)
if cur_len == max_length - 1:
logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits)
return logits
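
The tf.where pattern above (also used by TFMarianMTModel below) replaces the old per-column Python loop: build an index vector over the vocabulary once, then push every position except the forced token to LARGE_NEGATIVE. A tiny standalone sketch with a toy vocabulary; the logit values are made up for illustration:

    import tensorflow as tf

    LARGE_NEGATIVE = -1e8  # same role as the constant imported from modeling_tf_bart
    vocab_size, eos_token_id = 5, 3
    logits = tf.constant([[0.1, 0.4, 0.2, 0.05, 0.3]])  # (batch, vocab)

    vocab_range = tf.constant(range(vocab_size))
    forced = tf.where(vocab_range != eos_token_id, LARGE_NEGATIVE, logits)
    print(tf.argmax(forced, axis=-1).numpy())  # [3] -> only </s> can be generated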
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF Marian model, ported from the fairseq repo."""
from .configuration_marian import MarianConfig
from .file_utils import add_start_docstrings, is_tf_available
from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration
from .utils import logging
if is_tf_available():
import tensorflow as tf
_CONFIG_FOR_DOC = "MarianConfig"
START_DOCSTRING = BART_START_DOCSTRING.replace(
"inherits from :class:`~transformers.TFPreTrainedModel`",
"inherits from :class:`~transformers.TFBartForConditionalGeneration`",
).replace("BartConfig", _CONFIG_FOR_DOC)
logger = logging.get_logger(__name__)
@add_start_docstrings("Marian model for machine translation", START_DOCSTRING)
class TFMarianMTModel(TFBartForConditionalGeneration):
authorized_missing_keys = [
r"model.encoder.embed_positions.weight",
r"model.decoder.embed_positions.weight",
]
config_class = MarianConfig
def adjust_logits_during_generation(self, logits, cur_len, max_length):
"""Never predict pad_token_id. Predict </s> when max_length is reached."""
vocab_range = tf.constant(range(self.config.vocab_size))
logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits)
if cur_len == max_length - 1:
logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits)
return logits
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF mBART model, originally from fairseq."""
from .configuration_mbart import MBartConfig
from .file_utils import add_start_docstrings
from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration
from .utils import logging
_CONFIG_FOR_DOC = "MBartConfig"
START_DOCSTRING = BART_START_DOCSTRING.replace(
"inherits from :class:`~transformers.TFPreTrainedModel`",
"inherits from :class:`~transformers.TFBartForConditionalGeneration`",
).replace("BartConfig", _CONFIG_FOR_DOC)
logger = logging.get_logger(__name__)
@add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING)
class TFMBartForConditionalGeneration(TFBartForConditionalGeneration):
config_class = MBartConfig
# All the code is in src/transformers/modeling_tf_bart.py
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF Pegasus model, ported from the fairseq repo."""
from .configuration_pegasus import PegasusConfig
from .file_utils import add_start_docstrings
from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration
from .utils import logging
_CONFIG_FOR_DOC = "PegasusConfig"
START_DOCSTRING = BART_START_DOCSTRING.replace(
"inherits from :class:`~transformers.TFPreTrainedModel`",
"inherits from :class:`~transformers.TFBartForConditionalGeneration`",
).replace("BartConfig", _CONFIG_FOR_DOC)
logger = logging.get_logger(__name__)
@add_start_docstrings("Pegasus model for summarization", START_DOCSTRING)
class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration):
authorized_missing_keys = [
r"final_logits_bias",
r"model.encoder.embed_positions.weight",
r"model.decoder.embed_positions.weight",
]
config_class = PegasusConfig
# All the code is in src/transformers/modeling_tf_bart.py
...@@ -325,6 +325,15 @@ class TFBertPreTrainedModel: ...@@ -325,6 +325,15 @@ class TFBertPreTrainedModel:
requires_tf(self) requires_tf(self)
class TFBlenderbotForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...@@ -797,6 +806,24 @@ class TFLxmertVisualFeatureEncoder: ...@@ -797,6 +806,24 @@ class TFLxmertVisualFeatureEncoder:
requires_tf(self) requires_tf(self)
class TFMarianMTModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
class TFMBartForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...@@ -922,6 +949,15 @@ class TFOpenAIGPTPreTrainedModel: ...@@ -922,6 +949,15 @@ class TFOpenAIGPTPreTrainedModel:
requires_tf(self) requires_tf(self)
class TFPegasusForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
...@@ -138,7 +138,7 @@ class MarianIntegrationTest(unittest.TestCase): ...@@ -138,7 +138,7 @@ class MarianIntegrationTest(unittest.TestCase):
) )
self.assertEqual(self.model.device, model_inputs.input_ids.device) self.assertEqual(self.model.device, model_inputs.input_ids.device)
generated_ids = self.model.generate( generated_ids = self.model.generate(
model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2 model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
) )
generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
return generated_words return generated_words
...@@ -244,6 +244,8 @@ class TestMarian_RU_FR(MarianIntegrationTest): ...@@ -244,6 +244,8 @@ class TestMarian_RU_FR(MarianIntegrationTest):
@require_sentencepiece @require_sentencepiece
@require_tokenizers @require_tokenizers
class TestMarian_MT_EN(MarianIntegrationTest): class TestMarian_MT_EN(MarianIntegrationTest):
"""Cover low resource/high perplexity setting. This breaks without adjust_logits_generation overwritten"""
src = "mt" src = "mt"
tgt = "en" tgt = "en"
src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
import tempfile import tempfile
import unittest import unittest
from transformers import is_tf_available import numpy as np
from transformers import BartConfig, BartTokenizer, is_tf_available
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import is_pt_tf_cross_test, require_tf, slow from transformers.testing_utils import is_pt_tf_cross_test, require_tf, slow
...@@ -28,12 +30,16 @@ from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor ...@@ -28,12 +30,16 @@ from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import BartConfig, TFBartForConditionalGeneration, TFBartModel from transformers import TFBartForConditionalGeneration, TFBartModel
from transformers.tokenization_bart import BartTokenizer from transformers.modeling_tf_bart import TFSinusoidalPositionalEmbedding
@require_tf @require_tf
class ModelTester: class TFBartModelTester:
config_cls = BartConfig
config_updates = {}
hidden_act = "gelu"
def __init__(self, parent): def __init__(self, parent):
self.parent = parent self.parent = parent
self.batch_size = 13 self.batch_size = 13
...@@ -45,14 +51,13 @@ class ModelTester: ...@@ -45,14 +51,13 @@ class ModelTester:
self.num_hidden_layers = 5 self.num_hidden_layers = 5
self.num_attention_heads = 4 self.num_attention_heads = 4
self.intermediate_size = 37 self.intermediate_size = 37
self.hidden_act = "gelu"
self.hidden_dropout_prob = 0.1 self.hidden_dropout_prob = 0.1
self.attention_probs_dropout_prob = 0.1 self.attention_probs_dropout_prob = 0.1
self.max_position_embeddings = 20 self.max_position_embeddings = 20
self.eos_token_ids = [2] self.eos_token_ids = [2]
self.pad_token_id = 1 self.pad_token_id = 1
self.bos_token_id = 0 self.bos_token_id = 0
# torch.manual_seed(0)
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
...@@ -60,7 +65,7 @@ class ModelTester: ...@@ -60,7 +65,7 @@ class ModelTester:
input_ids = tf.concat([input_ids, eos_tensor], axis=1) input_ids = tf.concat([input_ids, eos_tensor], axis=1)
input_ids = tf.clip_by_value(input_ids, 3, self.vocab_size + 1) input_ids = tf.clip_by_value(input_ids, 3, self.vocab_size + 1)
config = BartConfig( config = self.config_cls(
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
d_model=self.hidden_size, d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers, encoder_layers=self.num_hidden_layers,
...@@ -76,6 +81,7 @@ class ModelTester: ...@@ -76,6 +81,7 @@ class ModelTester:
bos_token_id=self.bos_token_id, bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id, pad_token_id=self.pad_token_id,
decoder_start_token_id=self.pad_token_id, decoder_start_token_id=self.pad_token_id,
**self.config_updates,
) )
inputs_dict = prepare_bart_inputs_dict(config, input_ids) inputs_dict = prepare_bart_inputs_dict(config, input_ids)
return config, inputs_dict return config, inputs_dict
...@@ -101,9 +107,10 @@ class TestTFBart(TFModelTesterMixin, unittest.TestCase): ...@@ -101,9 +107,10 @@ class TestTFBart(TFModelTesterMixin, unittest.TestCase):
all_generative_model_classes = (TFBartForConditionalGeneration,) if is_tf_available() else () all_generative_model_classes = (TFBartForConditionalGeneration,) if is_tf_available() else ()
is_encoder_decoder = True is_encoder_decoder = True
test_pruning = False test_pruning = False
model_tester_cls = TFBartModelTester
def setUp(self): def setUp(self):
self.model_tester = ModelTester(self) self.model_tester = self.model_tester_cls(self)
self.config_tester = ConfigTester(self, config_class=BartConfig) self.config_tester = ConfigTester(self, config_class=BartConfig)
def test_config(self): def test_config(self):
...@@ -120,7 +127,7 @@ class TestTFBart(TFModelTesterMixin, unittest.TestCase): ...@@ -120,7 +127,7 @@ class TestTFBart(TFModelTesterMixin, unittest.TestCase):
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model_class = TFBartForConditionalGeneration model_class = self.all_generative_model_classes[0]
input_ids = { input_ids = {
"decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
"input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
...@@ -354,3 +361,29 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase): ...@@ -354,3 +361,29 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase):
expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]]) expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]])
assert np.allclose(features[0, :3, :3].numpy(), expected, atol=1e-3) assert np.allclose(features[0, :3, :3].numpy(), expected, atol=1e-3)
@require_tf
class TestTFSinusoidalPositionalEmbeddings(unittest.TestCase):
desired_weights = [
[0, 0, 0, 0, 0],
[0.84147096, 0.82177866, 0.80180490, 0.78165019, 0.76140374],
[0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258],
]
def test_positional_emb_cache_logic(self):
input_ids = _long_tensor([[4, 10]])
emb1 = TFSinusoidalPositionalEmbedding(num_positions=32, embedding_dim=6)
no_cache = emb1(input_ids, use_cache=False)
yes_cache = emb1(input_ids, use_cache=True)
self.assertEqual((1, 1, 6), yes_cache.shape) # extra dim to allow broadcasting, feel free to delete!
np.testing.assert_almost_equal(no_cache[-1].numpy(), yes_cache[0][0].numpy())
def test_positional_emb_weights_against_marian(self):
emb1 = TFSinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512)
emb1.build(None)
weights = emb1.embeddings.numpy()
for i, (expected_weight, actual_weight) in enumerate(zip(self.desired_weights, weights)):
for j in range(5):
self.assertAlmostEqual(expected_weight[j], actual_weight[j], places=3)
# coding=utf-8
# Copyright 2020 HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from tests.test_configuration_common import ConfigTester
from tests.test_modeling_tf_bart import TFBartModelTester
from tests.test_modeling_tf_common import TFModelTesterMixin
from transformers import BlenderbotConfig, BlenderbotSmallTokenizer, is_tf_available
from transformers.file_utils import cached_property
from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_tokenizers, slow
if is_tf_available():
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, TFBlenderbotForConditionalGeneration
class ModelTester(TFBartModelTester):
config_updates = dict(
normalize_before=True,
static_position_embeddings=True,
do_blenderbot_90_layernorm=True,
normalize_embeddings=True,
)
config_cls = BlenderbotConfig
@require_tf
class TestTFBlenderbotCommon(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFBlenderbotForConditionalGeneration,) if is_tf_available() else ()
all_generative_model_classes = (TFBlenderbotForConditionalGeneration,) if is_tf_available() else ()
model_tester_cls = ModelTester
is_encoder_decoder = True
test_pruning = False
def setUp(self):
self.model_tester = self.model_tester_cls(self)
self.config_tester = ConfigTester(self, config_class=BlenderbotConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_inputs_embeds(self):
# inputs_embeds not supported
pass
def test_saved_model_with_hidden_states_output(self):
# Should be uncommented during patrick TF refactor
pass
def test_saved_model_with_attentions_output(self):
# Should be uncommented during patrick TF refactor
pass
def test_compile_tf_model(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model_class = self.all_generative_model_classes[0]
input_ids = {
"decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
"input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
}
# Prepare our model
model = model_class(config)
model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving.
# Let's load it from the disk to be sure we can use pretrained weights
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
outputs_dict = model(input_ids)
hidden_states = outputs_dict[0]
# Add a dense layer on top to test integration with other keras modules
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
# Compile extended model
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
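# Integration test: loads the PyTorch facebook/blenderbot-90M weights into TF (from_pt=True) and checks generation from a long conversational input against known reference replies.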
@is_pt_tf_cross_test
@require_tokenizers
class TFBlenderbot90MIntegrationTests(unittest.TestCase):
src_text = [
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?"
]
model_name = "facebook/blenderbot-90M"
@cached_property
def tokenizer(self):
return BlenderbotSmallTokenizer.from_pretrained(self.model_name)
@cached_property
def model(self):
model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name, from_pt=True)
return model
@slow
def test_90_generation_from_long_input(self):
model_inputs = self.tokenizer(self.src_text, return_tensors="tf")
generated_ids = self.model.generate(
model_inputs.input_ids,
attention_mask=model_inputs.attention_mask,
num_beams=2,
use_cache=True,
)
generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)[0]
assert generated_words in (
"i don't know. i just feel like i'm going to throw up. it's not fun.",
"i'm not sure. i just feel like i've been feeling like i have to be in a certain place",
"i'm not sure. i just feel like i've been in a bad situation.",
)
# coding=utf-8
# Copyright 2020 HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
import warnings
from transformers import AutoTokenizer, MarianConfig, MarianTokenizer, TranslationPipeline, is_tf_available
from transformers.file_utils import cached_property
from transformers.testing_utils import is_pt_tf_cross_test, require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_bart import TFBartModelTester
from .test_modeling_tf_common import TFModelTesterMixin
if is_tf_available():
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, TFMarianMTModel
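# Reuses the Bart model tester; static (sinusoidal) position embeddings and a bias on the LM logits are the Marian-specific config settings exercised here.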
class ModelTester(TFBartModelTester):
config_updates = dict(static_position_embeddings=True, add_bias_logits=True)
config_cls = MarianConfig
@require_tf
class TestTFMarianCommon(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
all_generative_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
model_tester_cls = ModelTester
is_encoder_decoder = True
test_pruning = False
def setUp(self):
self.model_tester = self.model_tester_cls(self)
self.config_tester = ConfigTester(self, config_class=MarianConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_inputs_embeds(self):
# inputs_embeds not supported
pass
def test_saved_model_with_hidden_states_output(self):
# Should be re-enabled during Patrick's TF refactor
pass
def test_saved_model_with_attentions_output(self):
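# Should be re-enabled during Patrick's TF refactor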
pass
def test_compile_tf_model(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model_class = self.all_generative_model_classes[0]
input_ids = {
"decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
"input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
}
# Prepare our model
model = model_class(config)
model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving.
# Let's load it from the disk to be sure we can use pre-trained weights
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
outputs_dict = model(input_ids)
hidden_states = outputs_dict[0]
# Add a dense layer on top to test integration with other keras modules
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
# Compile extended model
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
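# Base class for the Marian translation integration tests below; subclasses set `src`, `tgt`, `src_text` and `expected_text`, and the Helsinki-NLP checkpoint name is derived from src/tgt in setUpClass.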
class AbstractMarianIntegrationTest(unittest.TestCase):
maxDiff = 1000 # show more chars for failing integration tests
@classmethod
def setUpClass(cls) -> None:
cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"
return cls
@cached_property
def tokenizer(self) -> MarianTokenizer:
return AutoTokenizer.from_pretrained(self.model_name)
@property
def eos_token_id(self) -> int:
return self.tokenizer.eos_token_id
@cached_property
def model(self):
warnings.simplefilter("error")
model: TFMarianMTModel = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name, from_pt=True)
assert isinstance(model, TFMarianMTModel)
c = model.config
self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
self.assertEqual(c.max_length, 512)
self.assertEqual(c.decoder_start_token_id, c.pad_token_id)
return model
def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
generated_words = self.translate_src_text(**tokenizer_kwargs)
self.assertListEqual(self.expected_text, generated_words)
def translate_src_text(self, **tokenizer_kwargs):
model_inputs = self.tokenizer.prepare_seq2seq_batch(
src_texts=self.src_text, **tokenizer_kwargs, return_tensors="tf"
)
generated_ids = self.model.generate(
model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
)
generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)
return generated_words
@require_sentencepiece
@require_tokenizers
@is_pt_tf_cross_test
class TestMarian_MT_EN(AbstractMarianIntegrationTest):
"""Cover low resource/high perplexity setting. This breaks if pad_token_id logits not set to LARGE_NEGATIVE."""
src = "mt"
tgt = "en"
src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]
@slow
def test_batch_generation_mt_en(self):
self._assert_generated_batch_equal_expected()
@is_pt_tf_cross_test
@require_sentencepiece
@require_tokenizers
class TestMarian_en_zh(AbstractMarianIntegrationTest):
src = "en"
tgt = "zh"
src_text = ["My name is Wolfgang and I live in Berlin"]
expected_text = ["我叫沃尔夫冈 我住在柏林"]
@slow
def test_batch_generation_en_zh(self):
self._assert_generated_batch_equal_expected()
@is_pt_tf_cross_test
@require_sentencepiece
@require_tokenizers
class TestMarian_en_ROMANCE(AbstractMarianIntegrationTest):
"""Multilingual on target side."""
src = "en"
tgt = "ROMANCE"
src_text = [
">>fr<< Don't spend so much time watching TV.",
">>pt<< Your message has been sent.",
">>es<< He's two years older than me.",
]
expected_text = [
"Ne passez pas autant de temps à regarder la télé.",
"A sua mensagem foi enviada.",
"Es dos años más viejo que yo.",
]
@slow
def test_batch_generation_en_ROMANCE_multi(self):
self._assert_generated_batch_equal_expected()
@slow
def test_pipeline(self):
pipeline = TranslationPipeline(self.model, self.tokenizer, framework="tf")
output = pipeline(self.src_text)
self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
# coding=utf-8
# Copyright 2020 HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from tests.test_configuration_common import ConfigTester
from tests.test_modeling_tf_bart import TFBartModelTester
from tests.test_modeling_tf_common import TFModelTesterMixin
from transformers import AutoTokenizer, MBartConfig, is_tf_available
from transformers.file_utils import cached_property
from transformers.testing_utils import is_pt_tf_cross_test, require_sentencepiece, require_tf, require_tokenizers, slow
if is_tf_available():
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, TFMBartForConditionalGeneration
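# Reuses the Bart model tester; pre-layernorm blocks plus a final layer norm are the MBart-specific config settings exercised here.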
class ModelTester(TFBartModelTester):
config_updates = dict(normalize_before=True, add_final_layer_norm=True)
config_cls = MBartConfig
@require_tf
class TestTFMBartCommon(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else ()
all_generative_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else ()
model_tester_cls = ModelTester
is_encoder_decoder = True
test_pruning = False
def setUp(self):
self.model_tester = self.model_tester_cls(self)
self.config_tester = ConfigTester(self, config_class=MBartConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_inputs_embeds(self):
# inputs_embeds not supported
pass
def test_saved_model_with_hidden_states_output(self):
# Should be re-enabled during Patrick's TF refactor
pass
def test_saved_model_with_attentions_output(self):
# Should be re-enabled during Patrick's TF refactor
pass
def test_compile_tf_model(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model_class = self.all_generative_model_classes[0]
input_ids = {
"decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
"input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
}
# Prepare our model
model = model_class(config)
model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving.
# Let's load it from the disk to be sure we can use pretrained weights
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
outputs_dict = model(input_ids)
hidden_states = outputs_dict[0]
# Add a dense layer on top to test integration with other keras modules
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
# Compile extended model
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
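# Integration test: English-to-Romanian translation with facebook/mbart-large-en-ro, loading the PyTorch weights into TF (from_pt=True) and comparing against a reference translation.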
@is_pt_tf_cross_test
@require_sentencepiece
@require_tokenizers
class TestMBartEnRO(unittest.TestCase):
src_text = [
" UN Chief Says There Is No Military Solution in Syria",
]
expected_text = [
"Şeful ONU declară că nu există o soluţie militară în Siria",
]
model_name = "facebook/mbart-large-en-ro"
@cached_property
def tokenizer(self):
return AutoTokenizer.from_pretrained(self.model_name)
@cached_property
def model(self):
model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name, from_pt=True)
return model
def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
generated_words = self.translate_src_text(**tokenizer_kwargs)
self.assertListEqual(self.expected_text, generated_words)
def translate_src_text(self, **tokenizer_kwargs):
model_inputs = self.tokenizer.prepare_seq2seq_batch(
src_texts=self.src_text, **tokenizer_kwargs, return_tensors="tf"
)
generated_ids = self.model.generate(
model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2
)
generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
return generated_words
@slow
def test_batch_generation_en_ro(self):
self._assert_generated_batch_equal_expected()
# coding=utf-8
# Copyright 2020 HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import AutoTokenizer, PegasusConfig, is_tf_available
from transformers.file_utils import cached_property
from transformers.testing_utils import is_pt_tf_cross_test, require_sentencepiece, require_tf, require_tokenizers, slow
from .test_configuration_common import ConfigTester
from .test_modeling_pegasus import PGE_ARTICLE, XSUM_ENTRY_LONGER
from .test_modeling_tf_bart import TFBartModelTester
from .test_modeling_tf_common import TFModelTesterMixin
if is_tf_available():
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, TFPegasusForConditionalGeneration
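# Reuses the Bart model tester; pre-layernorm, static sinusoidal position embeddings and a relu activation are the Pegasus-specific settings exercised here.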
class ModelTester(TFBartModelTester):
config_updates = dict(
normalize_before=True,
static_position_embeddings=True,
)
hidden_act = "relu"
config_cls = PegasusConfig
@require_tf
class TestTFPegasusCommon(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else ()
all_generative_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else ()
model_tester_cls = ModelTester
is_encoder_decoder = True
test_pruning = False
def setUp(self):
self.model_tester = self.model_tester_cls(self)
self.config_tester = ConfigTester(self, config_class=PegasusConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_inputs_embeds(self):
# inputs_embeds not supported
pass
def test_saved_model_with_hidden_states_output(self):
# Should be re-enabled during Patrick's TF refactor
pass
def test_saved_model_with_attentions_output(self):
# Should be re-enabled during Patrick's TF refactor
pass
def test_compile_tf_model(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model_class = self.all_generative_model_classes[0]
input_ids = {
"decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
"input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
}
# Prepare our model
model = model_class(config)
model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving.
# Let's load it from the disk to be sure we can use pretrained weights
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
outputs_dict = model(input_ids)
hidden_states = outputs_dict[0]
# Add a dense layer on top to test integration with other keras modules
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
# Compile extended model
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
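# Integration test: abstractive summarization with google/pegasus-xsum, loading the PyTorch weights into TF (from_pt=True); expected summaries differ slightly from the PyTorch outputs (see the comment on expected_text).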
@is_pt_tf_cross_test
@require_sentencepiece
@require_tokenizers
class TFPegasusIntegrationTests(unittest.TestCase):
src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER]
expected_text = [
"California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to reduce the risk of wildfires.",
'N-Dubz have revealed they\'re "grateful" to have been nominated for four Mobo Awards.',
] # differs slightly from pytorch, likely due to numerical differences in linear layers
model_name = "google/pegasus-xsum"
@cached_property
def tokenizer(self):
return AutoTokenizer.from_pretrained(self.model_name)
@cached_property
def model(self):
model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name, from_pt=True)
return model
def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
generated_words = self.translate_src_text(**tokenizer_kwargs)
assert self.expected_text == generated_words
def translate_src_text(self, **tokenizer_kwargs):
model_inputs = self.tokenizer.prepare_seq2seq_batch(
src_texts=self.src_text, **tokenizer_kwargs, return_tensors="tf"
)
generated_ids = self.model.generate(
model_inputs.input_ids,
attention_mask=model_inputs.attention_mask,
num_beams=2,
use_cache=True,
)
generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)
return generated_words
@slow
def test_batch_generation(self):
self._assert_generated_batch_equal_expected()
...@@ -67,6 +67,7 @@ MODEL_NAME_TO_DOC_FILE = {
"xlm_prophetnet": "xlmprophetnet.rst",
"xlm_roberta": "xlmroberta.rst",
"bert_generation": "bertgeneration.rst",
"marian": "marian.rst",
}
# This is to make sure the transformers module imported is the one in the repo.
...@@ -148,7 +149,6 @@ def get_model_doc_files():
_ignore_modules = [
"auto",
"dialogpt",
"marian",
"retribert",
]
doc_files = []
...@@ -245,6 +245,7 @@ def check_models_are_documented(module, doc_file):
def _get_model_name(module):
""" Get the model name for the module defining it."""
splits = module.__name__.split("_")
# Special case for transfo_xl
if splits[-1] == "xl":
return "_".join(splits[-2:])
......