Tensorflow improvements (#4530)

* Better None gradients handling * Apply Style * Apply Style * Create a loss class per task to compute its respective loss * Add loss classes to the ALBERT TF models * Add loss classes to the BERT TF models * Add question answering and multiple choice to TF Camembert * Remove prints * Add multiple choice model to TF DistilBERT + loss computation * Add question answering model to TF Electra + loss computation * Add token classification, question answering and multiple choice models to TF Flaubert * Add multiple choice model to TF Roberta + loss computation * Add multiple choice model to TF XLM + loss computation * Add multiple choice and question answering models to TF XLM-Roberta * Add multiple choice model to TF XLNet + loss computation * Remove unused parameters * Add task loss classes * Reorder TF imports + add new model classes * Add new model classes * Bugfix in TF T5 model * Bugfix for TF T5 tests * Bugfix in TF T5 model * Fix TF T5 model tests * Fix T5 tests + some renaming * Fix inheritance issue in the AutoX tests * Add tests for TF Flaubert and TF XLM Roberta * Add tests for TF Flaubert and TF XLM Roberta * Remove unused piece of code in the TF trainer * bugfix and remove unused code * Bugfix for TF 2.2 * Apply Style * Divide TFSequenceClassificationAndMultipleChoiceLoss into their two respective names * Apply style * Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameter and better dataset handling * Fix TF optimizations tests and apply style * Remove useless parameter * Bugfix and apply style * Fix TF Trainer prediction * Now the TF models return the loss like their PyTorch counterparts * Apply Style * Ignore some tests output * Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models. 
* Fix names for SQuAD data * Apply Style * Fix conflicts with 2.11 release * Fix conflicts with 2.11 * Fix wrong name * Add better documentation on the new create_optimizer function * Fix isort * logging_dir: use same default as PyTorch Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Tensorflow improvements (#4530)
* Better None gradients handling * Apply Style * Apply Style * Create a loss class per task to compute its respective loss * Add loss classes to the ALBERT TF models * Add loss classes to the BERT TF models * Add question answering and multiple choice to TF Camembert * Remove prints * Add multiple choice model to TF DistilBERT + loss computation * Add question answering model to TF Electra + loss computation * Add token classification, question answering and multiple choice models to TF Flaubert * Add multiple choice model to TF Roberta + loss computation * Add multiple choice model to TF XLM + loss computation * Add multiple choice and question answering models to TF XLM-Roberta * Add multiple choice model to TF XLNet + loss computation * Remove unused parameters * Add task loss classes * Reorder TF imports + add new model classes * Add new model classes * Bugfix in TF T5 model * Bugfix for TF T5 tests * Bugfix in TF T5 model * Fix TF T5 model tests * Fix T5 tests + some renaming * Fix inheritance issue in the AutoX tests * Add tests for TF Flaubert and TF XLM Roberta * Add tests for TF Flaubert and TF XLM Roberta * Remove unused piece of code in the TF trainer * bugfix and remove unused code * Bugfix for TF 2.2 * Apply Style * Divide TFSequenceClassificationAndMultipleChoiceLoss into their two respective names * Apply style * Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameter and better dataset handling * Fix TF optimizations tests and apply style * Remove useless parameter * Bugfix and apply style * Fix TF Trainer prediction * Now the TF models return the loss like their PyTorch counterparts * Apply Style * Ignore some tests output * Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models. 
* Fix names for SQuAD data * Apply Style * Fix conflicts with 2.11 release * Fix conflicts with 2.11 * Fix wrong name * Add better documentation on the new create_optimizer function * Fix isort * logging_dir: use same default as PyTorch Co-authored-by: Julien Chaumond <chaumond@gmail.com>
f9414f75 · Julien Plu · GitHub · ccd26c28 · f9414f75 · f9414f75
Unverified Commit f9414f75 authored Jun 05, 2020 by Julien Plu Committed by GitHub Jun 04, 2020
7 changed files
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -3,12 +3,12 @@
 import logging
 import math
 import os
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Tuple
 import numpy as np
 import tensorflow as tf
-from .modeling_tf_utils import TFPreTrainedModel, shape_list
+from .modeling_tf_utils import TFPreTrainedModel
 from .optimization_tf import GradientAccumulator, create_optimizer
 from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput
 from .training_args_tf import TFTrainingArguments
@@ -20,13 +20,14 @@ logger = logging.getLogger(__name__)
 class TFTrainer:
    model: TFPreTrainedModel
    args: TFTrainingArguments
-    # something similar to a PT Dataset.
-    # This is just temporary before to have
-    # a framework-agnostic approach for datasets.
    train_dataset: Optional[tf.data.Dataset]
    eval_dataset: Optional[tf.data.Dataset]
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
    prediction_loss_only: bool
+    tb_writer: Optional[tf.summary.SummaryWriter] = None
+    optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None
+    global_step: Optional[int] = None
+    epoch: Optional[float] = None
    def __init__(
        self,
@@ -36,6 +37,8 @@ class TFTrainer:
        eval_dataset: Optional[tf.data.Dataset] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        prediction_loss_only=False,
+        tb_writer: Optional[tf.summary.SummaryWriter] = None,
+        optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None,
    ):
        self.model = model
        self.args = args
@@ -43,120 +46,73 @@ class TFTrainer:
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        self.prediction_loss_only = prediction_loss_only
+        self.optimizers = optimizers
        self.gradient_accumulator = GradientAccumulator()
-        self._setup_training()
+        if tb_writer is not None:
+            self.tb_writer = tb_writer
+        else:
+            self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)
-    def _setup_training(self) -> None:
+    def get_train_tfdataset(self) -> tf.data.Dataset:
+        """
+        Build the distributed training dataset from `self.train_dataset`.
+        Side effects: sets `self.num_train_examples` (counted by reducing over the dataset)
+        and `self.train_steps` (`args.max_steps` when positive, otherwise the number of
+        batches needed to cover the examples once).
+        The examples are cached, shuffled, batched and prefetched; when `args.max_steps > 0`
+        the batched pipeline is repeated indefinitely.
+        Returns:
+          the pipeline wrapped by `args.strategy.experimental_distribute_dataset`.
+        Raises:
+          ValueError: if the trainer has no `train_dataset`.
+        """
-        """
+        if self.train_dataset is None:
-        Setup the different steps to train a model:
+            raise ValueError("Trainer: training requires a train_dataset.")
-          - check if all the data are given
-          - create the proper strategy
-          - create the features
-          - prepare the model settings
-        """
-        self._prepare_dataset()
-        with self.args.strategy.scope():
+        self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
-            self._create_optimizer()
-            _ = self.optimizer.iterations
-            self._set_loss_and_metric()
-            self._create_checkpoint_manager()
-            self._create_summary_writer()
-    def _set_loss_and_metric(self) -> None:
+        if self.args.max_steps > 0:
-        """
+            self.train_steps = self.args.max_steps
-        Create the training loss and metric with their name. Allowed names are those listed
+        else:
-        in the Tensorflow documentation and those contained in the transformers library.
+            self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
-        """
-        try:
-            self.loss = tf.keras.losses.get(
-                {
-                    "class_name": self.args.loss_name,
-                    "config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE},
-                }
-            )
-        except TypeError:
-            self.loss = tf.keras.losses.get(
-                {"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}}
-            )
-    def _create_summary_writer(self) -> None:
-        """
-        Create a summary writer to be able to read the logs in Tensorboard.
-        """
-        self.writer = tf.summary.create_file_writer(self.args.logging_dir)
-    def _prepare_dataset(self) -> None:
+        ds = (
-        """
+            self.train_dataset.cache()
-        Prepare the training, validation and test data.
+            .shuffle(self.num_train_examples)
-        """
+            .batch(self.args.train_batch_size)
-        if self.train_dataset is not None:
+            .prefetch(tf.data.experimental.AUTOTUNE)
-            self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
+        )
-            if self.args.max_steps > 0:
+        if self.args.max_steps > 0:
-                self.train_steps = self.args.max_steps
+            # Repeat the pipeline that is actually returned; repeating self.train_dataset
+            # here would leave `ds` finite and overwrite the raw dataset attribute.
+            ds = ds.repeat(-1)
-            else:
-                self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
-            self.train_dataset = (
+        return self.args.strategy.experimental_distribute_dataset(ds)
-                self.train_dataset.cache()
-                .shuffle(self.num_train_examples)
-                .batch(self.args.train_batch_size)
-                .prefetch(tf.data.experimental.AUTOTUNE)
-            )
-            if self.args.max_steps > 0:
+    def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
+        """
+        Build the distributed evaluation dataset.
+        Args:
+          eval_dataset: dataset to evaluate on; when None, `self.eval_dataset` is used.
+        Returns:
+          the dataset cached, batched by `args.eval_batch_size`, prefetched and wrapped by
+          `args.strategy.experimental_distribute_dataset`.
+        Raises:
+          ValueError: if neither `eval_dataset` nor `self.eval_dataset` is set.
+        """
-                self.train_dataset = self.train_dataset.repeat(-1)
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
-            self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset)
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
-        else:
+        ds = eval_dataset.cache().batch(self.args.eval_batch_size).prefetch(tf.data.experimental.AUTOTUNE)
-            self.train_steps = 0
-        if self.eval_dataset is not None:
+        return self.args.strategy.experimental_distribute_dataset(ds)
-            self.eval_dataset = (
-                self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
-            )
-            self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset)
-    def _create_optimizer(self) -> None:
+    def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
+        """
+        Build the distributed test dataset.
+        Args:
+          test_dataset: dataset to run predictions on.
+        Returns:
+          the dataset batched by `args.eval_batch_size` and wrapped by
+          `args.strategy.experimental_distribute_dataset` (no caching or prefetching here).
+        """
-        """
+        ds = test_dataset.batch(self.args.eval_batch_size)
-        Create the training optimizer with its name. Allowed names are those listed
-        in the Tensorflow documentation and those contained in the transformers library.
-        """
-        if self.args.optimizer_name == "adamw":
-            self.optimizer = create_optimizer(
-                self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr
-            )
-        else:
-            try:
-                self.optimizer = tf.keras.optimizers.get(
-                    {
-                        "class_name": self.args.optimizer_name,
-                        "config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon},
-                    }
-                )
-            except TypeError:
-                # This is for the case where the optimizer is not Adam-like such as SGD
-                self.optimizer = tf.keras.optimizers.get(
-                    {"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}}
-                )
-        logger.info("Created an/a {} optimizer".format(self.args.optimizer_name))
-    def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None:
+        return self.args.strategy.experimental_distribute_dataset(ds)
-        """
-        Create a checkpoint manager in order to be able to make the training
+    def get_optimizers(
-        fault-tolerant.
+        self,
-        Args:
+    ) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]:
-          max_to_keep: the maximum number of checkpoints to keep in the checkpoint path.
-          load_model: if we want to start the training from the latest checkpoint.
        """
-        ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
+        Setup the optimizer and the learning rate scheduler.
-        self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep)
+        We provide a reasonable default that works well.
+        If you want to use something else, you can pass a tuple in the Trainer's init,
+        or override this method in a subclass.
+        The default path reads `self.train_steps`, which is assigned by `get_train_tfdataset`,
+        so that method must have run before this one when no optimizers were passed in.
+        Returns:
+          `self.optimizers` when it is not None, otherwise the (optimizer, scheduler) pair
+          returned by `create_optimizer`.
+        """
+        if self.optimizers is not None:
+            return self.optimizers
+        optimizer, scheduler = create_optimizer(
+            self.args.learning_rate,
+            self.train_steps,
+            self.args.warmup_steps,
+            adam_epsilon=self.args.adam_epsilon,
+            weight_decay_rate=self.args.weight_decay,
+        )
-        if load_model:
+        return optimizer, scheduler
-            ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
    @tf.function
    def _evaluate_steps(self, per_replica_features, per_replica_labels):
@@ -182,6 +138,14 @@ class TFTrainer:
    def _prediction_loop(
        self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None
    ) -> PredictionOutput:
+        """
+        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
+        Works both with or without labels.
+        """
+        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
        logger.info("***** Running %s *****", description)
        logger.info("  Batch size = %d", self.args.eval_batch_size)
@@ -196,6 +160,12 @@ class TFTrainer:
            loss = tf.reduce_mean(loss)
            if not prediction_loss_only:
+                if isinstance(logits, tuple):
+                    logits = logits[0]
+                if isinstance(labels, tuple):
+                    labels = labels[0]
                if self.args.n_gpu > 1:
                    for val in logits.values:
                        if preds is None:
@@ -240,10 +210,9 @@ class TFTrainer:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
        """
-        if eval_dataset is None:
+        eval_ds = self.get_eval_tfdataset(eval_dataset)
-            eval_dataset = self.eval_dataset
-        output = self._prediction_loop(eval_dataset, description="Evaluation")
+        output = self._prediction_loop(eval_ds, description="Evaluation")
        return output.metrics
@@ -251,12 +220,25 @@ class TFTrainer:
        """
        Train method to train the model.
        """
+        train_ds = self.get_train_tfdataset()
        if self.args.debug:
            tf.summary.trace_on(graph=True, profiler=True)
        self.gradient_accumulator.reset()
-        iterations = self.optimizer.iterations
+        with self.args.strategy.scope():
+            optimizer, lr_scheduler = self.get_optimizers()
+            iterations = optimizer.iterations
+            ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model)
+            self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=5)
+            if self.model.ckpt_manager.latest_checkpoint:
+                logger.info(
+                    "Checkpoint file %s found and restoring from checkpoint", self.model.ckpt_manager.latest_checkpoint
+                )
+                ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
        if iterations.numpy() > 0:
            logger.info("Start the training from the last checkpoint")
@@ -268,21 +250,30 @@ class TFTrainer:
        epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
+        if self.args.fp16:
+            policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
+            tf.keras.mixed_precision.experimental.set_policy(policy)
+        with self.tb_writer.as_default():
+            tf.summary.text("args", self.args.to_json_string())
+        self.tb_writer.flush()
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", self.num_train_examples)
        logger.info("  Num Epochs = %d", epochs)
        logger.info("  Total optimization steps = %d", self.train_steps)
        for epoch in range(start_epoch, int(epochs + 1)):
-            for training_loss in self._training_steps():
+            for training_loss in self._training_steps(train_ds, optimizer):
                step = iterations.numpy()
                if self.args.debug:
-                    with self.writer.as_default():
+                    with self.tb_writer.as_default():
                        tf.summary.scalar("loss", training_loss, step=step)
                if step == 1 and self.args.debug:
-                    with self.writer.as_default():
+                    with self.tb_writer.as_default():
                        tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir)
                if self.args.evaluate_during_training and step % self.args.eval_steps == 0:
@@ -293,17 +284,16 @@ class TFTrainer:
                        eval_key = "eval_{}".format(key)
                        logs[eval_key] = value
-                    if callable(self.optimizer.learning_rate):
+                    logs["learning_rate"] = lr_scheduler(step).numpy()
-                        logs["learning_rate"] = self.optimizer.learning_rate(step).numpy()
-                    else:
-                        logs["learning_rate"] = self.optimizer.learning_rate.numpy()
                    logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs))
-                    with self.writer.as_default():
+                    with self.tb_writer.as_default():
                        for k, v in logs.items():
                            tf.summary.scalar(k, v, step=step)
+                    self.tb_writer.flush()
                if step % self.args.logging_steps == 0:
                    logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy()))
@@ -314,21 +304,21 @@ class TFTrainer:
                if step % self.train_steps == 0:
                    break
-    def _training_steps(self):
+    def _training_steps(self, ds, optimizer):
        """
        Returns a generator over training steps (i.e. parameters update).
+        Args:
+          ds: the training dataset forwarded to `_accumulate_next_gradients`.
+          optimizer: the optimizer forwarded to `_apply_gradients`.
+        Yields:
+          the loss of each batch after which the accumulated gradients were applied.
        """
-        for i, loss in enumerate(self._accumulate_next_gradients()):
+        for i, loss in enumerate(self._accumulate_next_gradients(ds)):
+            # NOTE(review): `i` starts at 0, so this condition is also true for the very first
+            # batch, i.e. the first update happens after a single accumulated batch - confirm intended.
            if i % self.args.gradient_accumulation_steps == 0:
-                self._apply_gradients()
+                self._apply_gradients(optimizer)
                yield loss
    @tf.function
-    def _apply_gradients(self):
+    def _apply_gradients(self, optimizer):
        """Applies the gradients (cross-replica)."""
+        # Run `self._step` through the distribution strategy, passing the optimizer as its
+        # single positional argument.
-        self.args.strategy.experimental_run_v2(self._step)
+        self.args.strategy.experimental_run_v2(self._step, args=(optimizer,))
-    def _step(self):
+    def _step(self, optimizer):
        """Applies gradients and resets accumulation."""
        gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync
        gradients = [
@@ -336,12 +326,12 @@ class TFTrainer:
        ]
        gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients]
-        self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
+        optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
        self.gradient_accumulator.reset()
-    def _accumulate_next_gradients(self):
+    def _accumulate_next_gradients(self, ds):
        """Accumulates the gradients from the next element in dataset."""
-        iterator = iter(self.train_dataset)
+        iterator = iter(ds)
        @tf.function
        def _accumulate_next():
@@ -388,23 +378,10 @@ class TFTrainer:
          labels: the batched labels.
          training: run the model in training mode or not
        """
-        if self.args.mode == "text-classification" or self.args.mode == "token-classification":
+        if isinstance(labels, (dict)):
-            logits = self.model(features, training=training)[0]
+            loss, logits = self.model(features, training=training, **labels)[:2]
-        else:
-            logits = self.model(features, training=training)
-        if self.args.mode == "token-classification":
-            active_loss = tf.reshape(labels, (-1,)) != -1
-            reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
-            labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
-            loss = self.loss(labels, reduced_logits)
-        elif self.args.mode == "question-answering":
-            start_loss = self.loss(labels["start_position"], logits[0])
-            end_loss = self.loss(labels["end_position"], logits[1])
-            loss = (start_loss + end_loss) / 2.0
        else:
-            loss = self.loss(labels, logits)
+            loss, logits = self.model(features, labels=labels, training=training)[:2]
        loss += sum(self.model.losses) * (1.0 / self.args.n_gpu)
        return loss, logits
@@ -418,19 +395,24 @@ class TFTrainer:
          test_dataset: something similar to a PT Dataset. This is just
            temporary before to have a framework-agnostic approach for datasets.
        """
-        test_dataset = test_dataset.batch(self.args.eval_batch_size)
+        test_ds = self.get_test_tfdataset(test_dataset)
-        test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset)
-        return self._prediction_loop(test_dataset, description="Prediction")
+        return self._prediction_loop(test_ds, description="Prediction")
-    def save_model(self) -> None:
+    def save_model(self, output_dir: Optional[str] = None):
        """
        Save the pretrained model and create a Tensorflow saved model.
+        Args:
+          output_dir: directory to save into; `self.args.output_dir` is used when None.
+        Raises:
+          ValueError: if `self.model` is not a `TFPreTrainedModel`.
        """
-        logger.info("Saving model in {}".format(self.args.output_dir))
+        output_dir = output_dir if output_dir is not None else self.args.output_dir
+        logger.info("Saving model in {}".format(output_dir))
-        path = os.path.join(self.args.output_dir, "saved_model")
+        # Use the resolved directory everywhere so an explicit `output_dir` is honoured.
+        path = os.path.join(output_dir, "saved_model")
        logger.info("Saving model in {}".format(path))
        os.makedirs(path, exist_ok=True)
+        if not isinstance(self.model, TFPreTrainedModel):
+            raise ValueError("Trainer.model appears to not be a PreTrainedModel")
-        self.model.save_pretrained(self.args.output_dir)
+        self.model.save_pretrained(output_dir)
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
 import dataclasses
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional, Tuple
@@ -27,6 +28,17 @@ def is_tpu_available():
 logger = logging.getLogger(__name__)
+def default_logdir() -> str:
+    """
+    Same default as PyTorch
+    Returns a path of the form `runs/<timestamp>_<hostname>`, where the timestamp is the
+    current local time formatted with "%b%d_%H-%M-%S" and the hostname comes from
+    `socket.gethostname()`. The value is computed at call time.
+    """
+    import socket
+    from datetime import datetime
+    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
+    return os.path.join("runs", current_time + "_" + socket.gethostname())
 @dataclass
 class TrainingArguments:
    """
@@ -97,7 +109,7 @@ class TrainingArguments:
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
-    logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
+    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})

--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -14,28 +14,9 @@ if is_tf_available():
 @dataclass
 class TFTrainingArguments(TrainingArguments):
-    optimizer_name: str = field(
-        default="adam",
-        metadata={
-            "help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"'
-        },
-    )
-    mode: str = field(
-        default="text-classification",
-        metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'},
-    )
-    loss_name: str = field(
-        default="SparseCategoricalCrossentropy",
-        metadata={
-            "help": "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses"
-        },
-    )
    tpu_name: str = field(
        default=None, metadata={"help": "Name of TPU"},
    )
-    end_lr: float = field(
-        default=0, metadata={"help": "End learning rate for optimizer"},
-    )
    eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."})
    debug: bool = field(
        default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"}

--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -30,7 +30,7 @@ if is_tf_available():
    import tensorflow as tf
    import numpy as np
-    from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding
+    from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding, TFSharedEmbeddings
    if _tf_gpu_memory_limit is not None:
        gpus = tf.config.list_physical_devices("GPU")
@@ -107,26 +107,45 @@ class TFModelTesterMixin:
            and getattr(module_member, "_keras_serializable", False)
        )
        for main_layer_class in tf_main_layer_classes:
-            main_layer = main_layer_class(config)
+            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
+            if "T5" in main_layer_class.__name__:
+                # Take the same values than in TFT5ModelTester for this shared layer
+                shared = TFSharedEmbeddings(99, 32, name="shared")
+                main_layer = main_layer_class(config, embed_tokens=shared)
+            else:
+                main_layer = main_layer_class(config)
            symbolic_inputs = {
                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
            }
            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
            outputs = model(inputs_dict)
            with tempfile.TemporaryDirectory() as tmpdirname:
                filepath = os.path.join(tmpdirname, "keras_model.h5")
                model.save(filepath)
-                model = tf.keras.models.load_model(
+                if "T5" in main_layer_class.__name__:
-                    filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                    model = tf.keras.models.load_model(
-                )
+                        filepath,
+                        custom_objects={
+                            main_layer_class.__name__: main_layer_class,
+                            "TFSharedEmbeddings": TFSharedEmbeddings,
+                        },
+                    )
+                else:
+                    model = tf.keras.models.load_model(
+                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+                    )
                assert isinstance(model, tf.keras.Model)
                after_outputs = model(inputs_dict)
                self.assert_outputs_same(after_outputs, outputs)
    def assert_outputs_same(self, after_outputs, outputs):
        # Make sure we don't have nans
-        out_1 = after_outputs[0].numpy()
+        if isinstance(after_outputs, tf.Tensor):
+            out_1 = after_outputs.numpy()
+        else:
+            out_1 = after_outputs[0].numpy()
        out_2 = outputs[0].numpy()
        self.assertEqual(out_1.shape, out_2.shape)
        out_1 = out_1[~np.isnan(out_1)]
@@ -269,7 +288,6 @@ class TFModelTesterMixin:
            inputs_keywords = copy.deepcopy(inputs_dict)
            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,)
            outputs_keywords = model(input_ids, **inputs_keywords)
            output_dict = outputs_dict[0].numpy()
            output_keywords = outputs_keywords[0].numpy()

--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from transformers import is_tf_available
+from .utils import require_tf, slow
+if is_tf_available():
+    import tensorflow as tf
+    import numpy as np
+    from transformers import TFFlaubertModel
+@require_tf
+class TFFlaubertModelIntegrationTest(unittest.TestCase):
+    """Integration test comparing `TFFlaubertModel` outputs with stored reference values."""
+    @slow
+    def test_output_embeds_base_model(self):
+        """
+        Load the "jplu/tf-flaubert-small-cased" checkpoint, run it on one fixed sequence of
+        8 token ids and check that the first output has shape (1, 8, 512) and that its
+        [:, :3, :3] slice matches the stored values within atol=1e-4.
+        """
+        model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")
+        input_ids = tf.convert_to_tensor(
+            [[0, 158, 735, 2592, 1424, 6727, 82, 1]], dtype=tf.int32,
+        )  # "J'aime flaubert !"
+        output = model(input_ids)[0]
+        expected_shape = tf.TensorShape((1, 8, 512))
+        self.assertEqual(output.shape, expected_shape)
+        # compare the actual values for a slice.
+        expected_slice = tf.convert_to_tensor(
+            [
+                [
+                    [-1.8768773, -1.566555, 0.27072418],
+                    [-1.6920038, -0.5873505, 1.9329599],
+                    [-2.9563985, -1.6993835, 1.7972052],
+                ]
+            ],
+            dtype=tf.float32,
+        )
+        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
--- a/tests/test_modeling_tf_xlm_roberta.py
+++ b/tests/test_modeling_tf_xlm_roberta.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from transformers import is_tf_available
+from .utils import require_tf, slow
+if is_tf_available():
+    import tensorflow as tf
+    import numpy as np
+    from transformers import TFXLMRobertaModel
+@require_tf
+class TFFlaubertModelIntegrationTest(unittest.TestCase):
+    """Integration test comparing `TFXLMRobertaModel` outputs with stored reference values."""
+    # NOTE(review): this class exercises TFXLMRobertaModel but is named after Flaubert; the
+    # name looks copy-pasted - consider renaming it to match the model under test.
+    @slow
+    def test_output_embeds_base_model(self):
+        """
+        Load the "jplu/tf-xlm-roberta-base" checkpoint, run it on one fixed sequence of
+        6 token ids with an all-ones attention mask and check that the first output has
+        shape (1, 6, 768) and that its [:, :3, :3] slice matches the stored values within
+        atol=1e-4.
+        """
+        model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
+        features = {
+            "input_ids": tf.convert_to_tensor([[0, 2646, 10269, 83, 99942, 2]], dtype=tf.int32),  # "My dog is cute"
+            "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
+        }
+        output = model(features)[0]
+        expected_shape = tf.TensorShape((1, 6, 768))
+        self.assertEqual(output.shape, expected_shape)
+        # compare the actual values for a slice.
+        expected_slice = tf.convert_to_tensor(
+            [
+                [
+                    [0.0681762, 0.10894451, 0.06772504],
+                    [-0.06423668, 0.02366615, 0.04329344],
+                    [-0.06057295, 0.09974135, -0.00070584],
+                ]
+            ],
+            dtype=tf.float32,
+        )
+        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
--- a/tests/test_optimization_tf.py
+++ b/tests/test_optimization_tf.py
@@ -47,7 +47,7 @@ class OptimizationFTest(unittest.TestCase):
        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
-            optimizer = create_optimizer(5e-5, 10, 5)
+            optimizer, _ = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
        def accumulate_on_replica(gradient):