chenpangpang / transformers · Commits

Unverified commit a52d56c8, authored Dec 14, 2019 by Thomas Wolf, committed via GitHub Dec 14, 2019

Merge branch 'master' into cleanup-configs

Parents: 8ade2040, e92bcb7e
Changes: 46 (showing 20 changed files with 2274 additions and 107 deletions, +2274 / -107)
Files shown on this page of the diff:

  transformers/hf_api.py                              +2    -1
  transformers/modeling_auto.py                       +11   -2
  transformers/modeling_encoder_decoder.py            +1    -3
  transformers/modeling_t5.py                         +886  -0
  transformers/modeling_tf_auto.py                    +11   -2
  transformers/modeling_tf_pytorch_utils.py           +7    -3
  transformers/modeling_tf_t5.py                      +775  -0
  transformers/modeling_tf_utils.py                   +2    -4
  transformers/modeling_tf_xlm.py                     +1    -1
  transformers/modeling_utils.py                      +12   -6
  transformers/modeling_xlm.py                        +11   -1
  transformers/tests/fixtures/empty.txt               +0    -0
  transformers/tests/hf_api_test.py                   +31   -13
  transformers/tests/modeling_common_test.py          +85   -38
  transformers/tests/modeling_t5_test.py              +185  -0
  transformers/tests/modeling_tf_common_test.py       +79   -31
  transformers/tests/modeling_tf_t5_test.py           +172  -0
  transformers/tests/modeling_tf_transfo_xl_test.py   +1    -1
  transformers/tests/modeling_transfo_xl_test.py      +1    -1
  transformers/tests/tokenization_bert_test.py        +1    -0
transformers/hf_api.py

@@ -131,8 +131,9 @@ class HfApi:
             # the client still has to specify it when uploading the file.
             with open(filepath, "rb") as f:
                 pf = TqdmProgressFileReader(f)
-                r = requests.put(urls.write, data=f, headers={
+                data = f if pf.total_size > 0 else ""
+                r = requests.put(urls.write, data=data, headers={
                     "content-type": urls.type,
                 })
                 r.raise_for_status()
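A minimal, hedged sketch of the upload pattern this hunk settles on, written outside the HfApi class: the presigned URL and content type are placeholders, and os.path.getsize stands in for the TqdmProgressFileReader total_size check.

import os
import requests

def put_file(filepath, presigned_url, content_type):
    # Send the open file as the request body, but fall back to an empty string
    # for zero-byte files so the body length stays well defined.
    with open(filepath, "rb") as f:
        data = f if os.path.getsize(filepath) > 0 else ""
        r = requests.put(presigned_url, data=data, headers={"content-type": content_type})
    r.raise_for_status()
    return r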
transformers/modeling_auto.py

@@ -29,6 +29,7 @@ from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequen
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
 from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
+from .modeling_t5 import T5Model, T5WithLMHeadModel
 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -49,6 +50,7 @@ class AutoModel(object):
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)

@@ -74,6 +76,7 @@ class AutoModel(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)

@@ -146,7 +149,9 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

@@ -185,6 +190,7 @@ class AutoModelWithLMHead(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)

@@ -213,6 +219,7 @@ class AutoModelWithLMHead(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)

@@ -284,7 +291,9 @@ class AutoModelWithLMHead(object):
             model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
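With this routing in place, a T5 checkpoint can be loaded through the Auto classes by name. A hedged usage sketch; 't5-small' is used purely as an illustrative identifier:

from transformers import AutoModel, AutoModelWithLMHead

# Any checkpoint name containing "t5" is now dispatched to the T5 classes.
model = AutoModel.from_pretrained('t5-small')               # resolves to T5Model
lm_model = AutoModelWithLMHead.from_pretrained('t5-small')  # resolves to T5WithLMHeadModel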
transformers/modeling_encoder_decoder.py

@@ -219,9 +219,7 @@ class PreTrainedEncoderDecoder(nn.Module):
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[
-                0
-            ]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
transformers/modeling_t5.py (new file, 0 → 100644, +886 lines)

This diff is collapsed in the page view.
transformers/modeling_tf_auto.py

@@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
 from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
+from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
 from .file_utils import add_start_docstrings

@@ -45,6 +46,7 @@ class TFAutoModel(object):
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFBertModel (Bert model)

@@ -68,6 +70,7 @@ class TFAutoModel(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFTFBertModel (Bert model)

@@ -137,7 +140,9 @@ class TFAutoModel(object):
             model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

@@ -173,6 +178,7 @@ class TFAutoModelWithLMHead(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)

@@ -199,6 +205,7 @@ class TFAutoModelWithLMHead(object):
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)

@@ -269,7 +276,9 @@ class TFAutoModelWithLMHead(object):
             model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
transformers/modeling_tf_pytorch_utils.py

@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
     logger.info("Loading PyTorch weights from {}".format(pt_path))
     pt_state_dict = torch.load(pt_path, map_location='cpu')
+    logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))

     return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)

@@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     start_prefix_to_remove = tf_model.base_model_prefix + '.'

     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
+    tf_loaded_numel = 0
     weight_value_tuples = []
     all_pytorch_weights = set(list(pt_state_dict.keys()))
     for symbolic_weight in symbolic_weights:

@@ -159,7 +160,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
                 e.args += (symbolic_weight.shape, array.shape)
                 raise e

-        logger.info("Initialize TF weight {}".format(symbolic_weight.name))
+        tf_loaded_numel += array.size
+        # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))

         weight_value_tuples.append((symbolic_weight, array))
         all_pytorch_weights.discard(name)

@@ -169,6 +171,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run

+    logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
+
     logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))

     return tf_model

@@ -272,7 +276,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
             e.args += (pt_weight.shape, array.shape)
             raise e

-        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
+        # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))

         new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
         loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
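The new log line counts parameters straight from the checkpoint's state dict. A small, hedged sketch of the same computation on an arbitrary PyTorch module standing in for a loaded checkpoint:

import torch

linear = torch.nn.Linear(4, 3)                       # stand-in for a loaded checkpoint
pt_state_dict = linear.state_dict()
num_params = sum(t.numel() for t in pt_state_dict.values())
print("PyTorch checkpoint contains {:,} parameters".format(num_params))  # 4*3 weights + 3 biases = 15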
transformers/modeling_tf_t5.py (new file, 0 → 100644, +775 lines)

This diff is collapsed in the page view.
transformers/modeling_tf_utils.py

@@ -24,14 +24,12 @@ import os
 import tensorflow as tf

 from .configuration_utils import PretrainedConfig
 from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
-                         cached_path, hf_bucket_url, is_remote_url)
+                         DUMMY_INPUTS, cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)

-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-

 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.

@@ -60,7 +58,7 @@ class TFPreTrainedModel(tf.keras.Model):
         Returns:
             tf.Tensor with dummy inputs
         """
-        return tf.constant(DUMMY_INPUTS)
+        return {'input_ids': tf.constant(DUMMY_INPUTS)}

     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
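dummy_inputs now returns a dict keyed by input name rather than a bare tensor, so models whose call() takes several named inputs (such as the new encoder-decoder T5) can be built with the same generic forward pass. A hedged sketch with a made-up TFToyModel, not library code:

import tensorflow as tf

DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]

class TFToyModel(tf.keras.Model):
    def __init__(self):
        super(TFToyModel, self).__init__()
        self.embed = tf.keras.layers.Embedding(100, 8)

    @property
    def dummy_inputs(self):
        # Keyed by input name so call() can pick out exactly what it needs.
        return {'input_ids': tf.constant(DUMMY_INPUTS)}

    def call(self, inputs, training=False):
        return self.embed(inputs['input_ids'])

model = TFToyModel()
_ = model(model.dummy_inputs, training=False)  # one dummy pass builds the weights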
transformers/modeling_tf_xlm.py

@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
             langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
         else:
             langs_list = None
-        return [inputs_list, attns_list, langs_list]
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}


 XLM_START_DOCSTRING = r""" The XLM model was proposed in
transformers/modeling_utils.py

@@ -31,12 +31,11 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F

 from .configuration_utils import PretrainedConfig
 from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
-                         cached_path, hf_bucket_url, is_remote_url)
+                         DUMMY_INPUTS, cached_path, hf_bucket_url, is_remote_url)

 logger = logging.getLogger(__name__)

 try:
     from torch.nn import Identity
 except ImportError:

@@ -72,6 +71,15 @@ class PreTrainedModel(nn.Module):
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""

+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            torch.Tensor with dummy inputs
+        """
+        return {'input_ids': torch.tensor(DUMMY_INPUTS)}
+
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
         if not isinstance(config, PretrainedConfig):

@@ -161,8 +169,7 @@ class PreTrainedModel(nn.Module):
             base_model.vocab_size = new_num_tokens

         # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
-            self.tie_weights()
+        self.tie_weights()

         return model_embeds

@@ -478,8 +485,7 @@ class PreTrainedModel(nn.Module):
                 raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                    model.__class__.__name__, "\n\t".join(error_msgs)))

-        if hasattr(model, 'tie_weights'):
-            model.tie_weights()  # make sure word embedding weights are still tied if needed
+        model.tie_weights()  # make sure word embedding weights are still tied

         # Set model in evaluation mode to desactivate DropOut modules by default
         model.eval()
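The PyTorch base class gains the matching hook; because the dict keys line up with forward()'s argument names, the dummy batch can simply be splatted into a call. A short hedged sketch with an illustrative toy module, not library code:

import torch

DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]

class ToyModel(torch.nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.embed = torch.nn.Embedding(100, 8)

    @property
    def dummy_inputs(self):
        return {'input_ids': torch.tensor(DUMMY_INPUTS)}

    def forward(self, input_ids=None):
        return self.embed(input_ids)

model = ToyModel()
out = model(**model.dummy_inputs)   # keys match the forward() argument names
print(out.shape)                    # torch.Size([3, 5, 8])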
transformers/modeling_xlm.py

@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)

+    @property
+    def dummy_inputs(self):
+        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+
     def _init_weights(self, module):
         """ Initialize the weights. """
         if isinstance(module, nn.Embedding):

@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                                        langs=langs,
                                        token_type_ids=token_type_ids,
                                        position_ids=position_ids,
                                        lengths=lengths,
                                        cache=cache,
                                        head_mask=head_mask,
                                        inputs_embeds=inputs_embeds)
transformers/tests/fixtures/empty.txt (new empty file, 0 → 100644)
transformers/tests/hf_api_test.py

@@ -15,18 +15,30 @@
 from __future__ import absolute_import, division, print_function

 import os
-import six
 import time
 import unittest

-from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
+import requests
+import six
+
+from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj


 USER = "__DUMMY_TRANSFORMERS_USER__"
 PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILE_KEY = "Test-{}.txt".format(int(time.time()))
-FILE_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
-)
+FILES = [
+    (
+        "Test-{}.txt".format(int(time.time())),
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt")
+    ),
+    (
+        "yoyo {}.txt".format(int(time.time())),  # space is intentional
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt")
+    ),
+]

@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
         self.assertEqual(user, USER)

     def test_presign(self):
-        urls = self._api.presign(token=self._token, filename=FILE_KEY)
-        self.assertIsInstance(urls, PresignedUrl)
-        self.assertEqual(urls.type, "text/plain")
+        for FILE_KEY, FILE_PATH in FILES:
+            urls = self._api.presign(token=self._token, filename=FILE_KEY)
+            self.assertIsInstance(urls, PresignedUrl)
+            self.assertEqual(urls.type, "text/plain")

     def test_presign_and_upload(self):
-        access_url = self._api.presign_and_upload(
-            token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-        )
-        self.assertIsInstance(access_url, six.string_types)
+        for FILE_KEY, FILE_PATH in FILES:
+            access_url = self._api.presign_and_upload(
+                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
+            )
+            self.assertIsInstance(access_url, six.string_types)
+            with open(FILE_PATH, 'r') as f:
+                body = f.read()
+            r = requests.get(access_url)
+            self.assertEqual(r.text, body)

     def test_list_objs(self):
         objs = self._api.list_objs(token=self._token)
transformers/tests/modeling_common_test.py

@@ -58,7 +58,7 @@ else:
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init

@@ -73,6 +73,7 @@ class CommonTestCases:
         test_pruning = True
         test_resize_embeddings = True
         test_head_masking = True
+        is_encoder_decoder = False

         def test_save_load(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -83,6 +84,8 @@ class CommonTestCases:
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
+                out_2 = outputs[0].numpy()
+                out_2[np.isnan(out_2)] = 0

                 with TemporaryDirectory() as tmpdirname:
                     model.save_pretrained(tmpdirname)

@@ -93,9 +96,7 @@ class CommonTestCases:
                 # Make sure we don't have nans
                 out_1 = after_outputs[0].cpu().numpy()
-                out_2 = outputs[0].cpu().numpy()
-                out_1[np.isnan(out_1)] = 0
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
                 max_diff = np.amax(np.abs(out_1 - out_2))
                 self.assertLessEqual(max_diff, 1e-5)

@@ -117,20 +118,32 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
-                self.assertEqual(first.ne(second).sum().item(), 0)
+                with torch.no_grad():
+                    first = model(**inputs_dict)[0]
+                    second = model(**inputs_dict)[0]
+                out_1 = first.cpu().numpy()
+                out_2 = second.cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)

         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length

             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, False)

@@ -138,28 +151,42 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])
                 out_len = len(outputs)

+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2) - 1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
-                self.assertEqual(out_len + 1, len(outputs))
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)

-                attentions = outputs[-1]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self_attentions = outputs[-1]
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])

         def test_torchscript(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -223,7 +250,6 @@ class CommonTestCases:
             self.assertTrue(models_equal)

         def test_headmasking(self):
             if not self.test_head_masking:
                 return

@@ -278,7 +304,6 @@ class CommonTestCases:
             self.assertNotEqual(
                 attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

         def test_head_pruning(self):
             if not self.test_pruning:
                 return

@@ -297,7 +322,8 @@ class CommonTestCases:
             heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                               -1: [0]}
             model.prune_heads(heads_to_prune)
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)

             attentions = outputs[-1]

@@ -333,7 +359,8 @@ class CommonTestCases:
             model = model_class.from_pretrained(directory)
             model.to(torch_device)
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], 1)
             self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)

@@ -362,7 +389,8 @@ class CommonTestCases:
             model.to(torch_device)
             model.eval()
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], 1)

@@ -389,7 +417,8 @@ class CommonTestCases:
             model.to(torch_device)
             model.eval()
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)

@@ -406,7 +435,8 @@ class CommonTestCases:
             model.to(torch_device)
             shutil.rmtree(directory)
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)

@@ -417,7 +447,8 @@ class CommonTestCases:
             heads_to_prune = {0: [0], 2: [1, 2]}
             model.prune_heads(heads_to_prune)
-            outputs = model(**inputs_dict)
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
             attentions = outputs[-1]
             self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)

@@ -427,7 +458,6 @@ class CommonTestCases:
             self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})

         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -437,14 +467,16 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
                 self.assertEqual(model.config.output_attentions, False)
                 self.assertEqual(model.config.output_hidden_states, True)
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                     self.model_tester.hidden_size])

         def test_resize_tokens_embeddings(self):
             original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -550,8 +582,14 @@ class CommonTestCases:
         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]

             for model_class in self.all_model_classes:
                 model = model_class(config)

@@ -559,9 +597,14 @@ class CommonTestCases:
                 model.eval()

                 wte = model.get_input_embeddings()
-                inputs_dict["inputs_embeds"] = wte(input_ids)
-                outputs = model(**inputs_dict)
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = wte(input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)

     class GPTModelTester(CommonModelTester):

@@ -649,9 +692,10 @@ class CommonTestCases:
             model.to(torch_device)
             model.eval()

-            outputs = model(input_ids, position_ids, token_type_ids)
-            outputs = model(input_ids, position_ids)
-            outputs = model(input_ids)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids)
+                outputs = model(input_ids, position_ids)
+                outputs = model(input_ids)

             hidden_state = outputs[0]
             self.parent.assertListEqual(

@@ -664,7 +708,8 @@ class CommonTestCases:
             model = self.lm_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]

             total_voc = self.vocab_size

@@ -681,7 +726,8 @@ class CommonTestCases:
             model = model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids)
+            with torch.no_grad():
+                outputs = model(input_ids)
             presents = outputs[-1]
             self.parent.assertEqual(self.num_hidden_layers, len(presents))
             self.parent.assertListEqual(

@@ -694,7 +740,8 @@ class CommonTestCases:
             model = self.double_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                            token_type_ids=token_type_ids, position_ids=position_ids)
+            with torch.no_grad():
+                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+                                token_type_ids=token_type_ids, position_ids=position_ids)
             lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
             loss = [lm_loss, mc_loss]
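Two changes recur throughout this file: forward passes are wrapped in torch.no_grad(), and outputs are compared only where neither side is NaN. A hedged sketch of that comparison pattern on plain numpy arrays (the values are made up; in the tests the two outputs carry NaNs in the same positions, so the filtered arrays stay aligned):

import numpy as np

out_1 = np.array([1.0, np.nan, 3.0])
out_2 = np.array([1.0, np.nan, 3.0 + 1e-6])

# Drop NaN entries before taking the maximum absolute difference,
# mirroring out_1 = out_1[~np.isnan(out_1)] in the tests.
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2))
assert max_diff <= 1e-5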
transformers/tests/modeling_t5_test.py (new file, 0 → 100644)
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
shutil
from
transformers
import
is_torch_available
from
.modeling_common_test
import
(
CommonTestCases
,
ids_tensor
,
floats_tensor
)
from
.configuration_common_test
import
ConfigTester
from
.utils
import
require_torch
,
slow
,
torch_device
if
is_torch_available
():
from
transformers
import
(
T5Config
,
T5Model
,
T5WithLMHeadModel
)
from
transformers.modeling_t5
import
T5_PRETRAINED_MODEL_ARCHIVE_MAP
@
require_torch
class
T5ModelTest
(
CommonTestCases
.
CommonModelTester
):
all_model_classes
=
(
T5Model
,
T5WithLMHeadModel
)
if
is_torch_available
()
else
()
test_pruning
=
False
test_torchscript
=
False
test_resize_embeddings
=
False
is_encoder_decoder
=
True
class
T5ModelTester
(
object
):
def
__init__
(
self
,
parent
,
batch_size
=
13
,
encoder_seq_length
=
7
,
decoder_seq_length
=
9
,
is_training
=
True
,
use_attention_mask
=
True
,
use_labels
=
True
,
vocab_size
=
99
,
n_positions
=
14
,
hidden_size
=
32
,
num_hidden_layers
=
5
,
num_attention_heads
=
4
,
d_ff
=
37
,
relative_attention_num_buckets
=
8
,
dropout_rate
=
0.1
,
initializer_factor
=
0.002
,
scope
=
None
,
):
self
.
parent
=
parent
self
.
batch_size
=
batch_size
self
.
encoder_seq_length
=
encoder_seq_length
self
.
decoder_seq_length
=
decoder_seq_length
self
.
is_training
=
is_training
self
.
use_attention_mask
=
use_attention_mask
self
.
use_labels
=
use_labels
self
.
vocab_size
=
vocab_size
self
.
n_positions
=
n_positions
self
.
hidden_size
=
hidden_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
d_ff
=
d_ff
self
.
relative_attention_num_buckets
=
relative_attention_num_buckets
self
.
dropout_rate
=
dropout_rate
self
.
initializer_factor
=
initializer_factor
self
.
scope
=
scope
def
prepare_config_and_inputs
(
self
):
encoder_input_ids
=
ids_tensor
([
self
.
batch_size
,
self
.
encoder_seq_length
],
self
.
vocab_size
)
decoder_input_ids
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
self
.
vocab_size
)
encoder_attention_mask
=
None
decoder_attention_mask
=
None
if
self
.
use_attention_mask
:
encoder_attention_mask
=
ids_tensor
([
self
.
batch_size
,
self
.
encoder_seq_length
],
vocab_size
=
2
)
decoder_attention_mask
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
vocab_size
=
2
)
decoder_lm_labels
=
None
if
self
.
use_labels
:
decoder_lm_labels
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
self
.
vocab_size
)
config
=
T5Config
(
vocab_size_or_config_json_file
=
self
.
vocab_size
,
n_positions
=
self
.
n_positions
,
d_model
=
self
.
hidden_size
,
d_ff
=
self
.
d_ff
,
d_kv
=
self
.
hidden_size
//
self
.
num_attention_heads
,
num_layers
=
self
.
num_hidden_layers
,
num_heads
=
self
.
num_attention_heads
,
relative_attention_num_buckets
=
self
.
relative_attention_num_buckets
,
dropout_rate
=
self
.
dropout_rate
,
initializer_factor
=
self
.
initializer_factor
)
return
(
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
)
def
check_loss_output
(
self
,
result
):
self
.
parent
.
assertListEqual
(
list
(
result
[
"loss"
].
size
()),
[])
def
create_and_check_t5_model
(
self
,
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
):
model
=
T5Model
(
config
=
config
)
model
.
eval
()
decoder_output
,
encoder_output
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
encoder_attention_mask
=
encoder_attention_mask
,
decoder_attention_mask
=
decoder_attention_mask
)
decoder_output
,
encoder_output
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
)
result
=
{
"encoder_output"
:
encoder_output
,
"decoder_output"
:
decoder_output
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"encoder_output"
].
size
()),
[
self
.
batch_size
,
self
.
encoder_seq_length
,
self
.
hidden_size
])
self
.
parent
.
assertListEqual
(
list
(
result
[
"decoder_output"
].
size
()),
[
self
.
batch_size
,
self
.
decoder_seq_length
,
self
.
hidden_size
])
def
create_and_check_t5_with_lm_head
(
self
,
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
):
model
=
T5WithLMHeadModel
(
config
=
config
)
model
.
eval
()
outputs
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
decoder_attention_mask
=
decoder_attention_mask
,
decoder_lm_labels
=
decoder_lm_labels
)
loss
,
prediction_scores
=
outputs
[
0
],
outputs
[
1
]
result
=
{
"loss"
:
loss
,
"prediction_scores"
:
prediction_scores
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"prediction_scores"
].
size
()),
[
self
.
batch_size
,
self
.
decoder_seq_length
,
self
.
vocab_size
])
self
.
check_loss_output
(
result
)
def
prepare_config_and_inputs_for_common
(
self
):
config_and_inputs
=
self
.
prepare_config_and_inputs
()
(
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
)
=
config_and_inputs
inputs_dict
=
{
'encoder_input_ids'
:
encoder_input_ids
,
'decoder_input_ids'
:
decoder_input_ids
,
'decoder_attention_mask'
:
decoder_attention_mask
,
'encoder_attention_mask'
:
encoder_attention_mask
}
return
config
,
inputs_dict
def
setUp
(
self
):
self
.
model_tester
=
T5ModelTest
.
T5ModelTester
(
self
)
self
.
config_tester
=
ConfigTester
(
self
,
config_class
=
T5Config
,
d_model
=
37
)
def
test_config
(
self
):
self
.
config_tester
.
run_common_tests
()
def
test_t5_model
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_model
(
*
config_and_inputs
)
def
test_with_lm_head
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_with_lm_head
(
*
config_and_inputs
)
@
slow
def
test_model_from_pretrained
(
self
):
cache_dir
=
"/tmp/transformers_test/"
for
model_name
in
list
(
T5_PRETRAINED_MODEL_ARCHIVE_MAP
.
keys
())[:
1
]:
model
=
T5Model
.
from_pretrained
(
model_name
,
cache_dir
=
cache_dir
)
shutil
.
rmtree
(
cache_dir
)
self
.
assertIsNotNone
(
model
)
if
__name__
==
"__main__"
:
unittest
.
main
()
transformers/tests/modeling_tf_common_test.py
View file @
a52d56c8
...
@@ -69,6 +69,7 @@ class TFCommonTestCases:
...
@@ -69,6 +69,7 @@ class TFCommonTestCases:
test_torchscript
=
True
test_torchscript
=
True
test_pruning
=
True
test_pruning
=
True
test_resize_embeddings
=
True
test_resize_embeddings
=
True
is_encoder_decoder
=
False
def
test_initialization
(
self
):
def
test_initialization
(
self
):
pass
pass
...
@@ -129,8 +130,12 @@ class TFCommonTestCases:
...
@@ -129,8 +130,12 @@ class TFCommonTestCases:
for
name
,
key
in
inputs_dict
.
items
())
for
name
,
key
in
inputs_dict
.
items
())
with
torch
.
no_grad
():
with
torch
.
no_grad
():
pto
=
pt_model
(
**
pt_inputs_dict
)
pto
=
pt_model
(
**
pt_inputs_dict
)
tfo
=
tf_model
(
inputs_dict
)
tfo
=
tf_model
(
inputs_dict
,
training
=
False
)
max_diff
=
np
.
amax
(
np
.
abs
(
tfo
[
0
].
numpy
()
-
pto
[
0
].
numpy
()))
tf_hidden_states
=
tfo
[
0
].
numpy
()
pt_hidden_states
=
pto
[
0
].
numpy
()
tf_hidden_states
[
np
.
isnan
(
tf_hidden_states
)]
=
0
pt_hidden_states
[
np
.
isnan
(
pt_hidden_states
)]
=
0
max_diff
=
np
.
amax
(
np
.
abs
(
tf_hidden_states
-
pt_hidden_states
))
self
.
assertLessEqual
(
max_diff
,
2e-2
)
self
.
assertLessEqual
(
max_diff
,
2e-2
)
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
...
@@ -150,13 +155,21 @@ class TFCommonTestCases:
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict)
-               max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+               tfo = tfo[0].numpy()
+               pto = pto[0].numpy()
+               tfo[np.isnan(tfo)] = 0
+               pto[np.isnan(pto)] = 0
+               max_diff = np.amax(np.abs(tfo - pto))
                self.assertLessEqual(max_diff, 2e-2)

        def test_compile_tf_model(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-           input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+           if self.is_encoder_decoder:
+               input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                            'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+           else:
+               input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
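test_compile_tf_model now builds a dict of symbolic Keras inputs when the tester is an encoder-decoder model. A minimal sketch of the same Keras pattern, with a toy functional model standing in for the transformer (the toy Embedding/Dense layers are purely illustrative, not the library's architecture):

import tensorflow as tf

# Two symbolic inputs, keyed the way the test keys them for encoder-decoder models.
inputs = {'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32'),
          'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32')}

# Toy stand-in for the transformer body: embed the decoder ids and project to logits.
x = tf.keras.layers.Embedding(100, 16)(inputs['decoder_input_ids'])
logits = tf.keras.layers.Dense(100)(x)

model = tf.keras.Model(inputs=inputs, outputs=logits)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])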
...
@@ -189,7 +202,7 @@ class TFCommonTestCases:
                outputs_dict = model(inputs_dict)

                inputs_keywords = copy.deepcopy(inputs_dict)
-               input_ids = inputs_keywords.pop('input_ids')
+               input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                outputs_keywords = model(input_ids, **inputs_keywords)

                output_dict = outputs_dict[0].numpy()
...
@@ -200,6 +213,11 @@ class TFCommonTestCases:
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+           decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+           encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+           decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+           encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length

            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
...
@@ -212,16 +230,28 @@ class TFCommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
                out_len = len(outputs)

+               if self.is_encoder_decoder:
+                   self.assertEqual(out_len % 2, 0)
+                   decoder_attentions = outputs[(out_len // 2) - 1]
+                   self.assertEqual(model.config.output_attentions, True)
+                   self.assertEqual(model.config.output_hidden_states, False)
+                   self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                   self.assertListEqual(
+                       list(decoder_attentions[0].shape[-3:]),
+                       [self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        decoder_key_length])
+
                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                outputs = model(inputs_dict)
-               self.assertEqual(out_len + 1, len(outputs))
+               self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)
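The new encoder-decoder branch leans on an output-ordering convention that this check appears to rely on: with attentions enabled, the returned tuple splits into two equal halves (one per stack), attentions are the last entry of each half, and the decoder self-attentions are read from the end of the first half, hence the index (out_len // 2) - 1. A tiny illustration with placeholder values standing in for real tensors:

# Placeholder tuple standing in for an encoder-decoder model's outputs with output_attentions=True.
outputs = ("decoder_hidden", "decoder_attentions", "encoder_hidden", "encoder_attentions")
out_len = len(outputs)
assert out_len % 2 == 0
decoder_attentions = outputs[(out_len // 2) - 1]  # last item of the first half
encoder_attentions = outputs[-1]                  # last item of the second half
assert decoder_attentions == "decoder_attentions"
assert encoder_attentions == "encoder_attentions"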
...
@@ -230,8 +260,8 @@ class TFCommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])

        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...
@@ -264,35 +294,53 @@ class TFCommonTestCases:
            for model_class in self.all_model_classes:
                model = model_class(config)
                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-               self.assertTrue(tf.math.equal(first, second).numpy().all())
+               out_1 = first.numpy()
+               out_2 = second.numpy()
+               out_1 = out_1[~np.isnan(out_1)]
+               out_2 = out_2[~np.isnan(out_2)]
+               max_diff = np.amax(np.abs(out_1 - out_2))
+               self.assertLessEqual(max_diff, 1e-5)
+
+       def _get_embeds(self, wte, input_ids):
+           # ^^ In our TF models, the input_embeddings can take slightly different forms,
+           # so we try a few of them.
+           # We used to fall back to just synthetically creating a dummy tensor of ones:
+           try:
+               x = wte(input_ids, mode="embedding")
+           except:
+               try:
+                   x = wte([input_ids], mode="embedding")
+               except:
+                   try:
+                       x = wte([input_ids, None, None, None], mode="embedding")
+                   except:
+                       if hasattr(self.model_tester, "embedding_size"):
+                           x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                       else:
+                           x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+           return x

        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-           input_ids = inputs_dict["input_ids"]
-           del inputs_dict["input_ids"]
+           if not self.is_encoder_decoder:
+               input_ids = inputs_dict["input_ids"]
+               del inputs_dict["input_ids"]
+           else:
+               encoder_input_ids = inputs_dict["encoder_input_ids"]
+               decoder_input_ids = inputs_dict["decoder_input_ids"]
+               del inputs_dict["encoder_input_ids"]
+               del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)
                wte = model.get_input_embeddings()
-               try:
-                   x = wte(input_ids, mode="embedding")
-               except:
-                   try:
-                       x = wte([input_ids], mode="embedding")
-                   except:
-                       try:
-                           x = wte([input_ids, None, None, None], mode="embedding")
-                       except:
-                           if hasattr(self.model_tester, "embedding_size"):
-                               x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                           else:
-                               x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-               # ^^ In our TF models, the input_embeddings can take slightly different forms,
-               # so we try a few of them.
-               # We used to fall back to just synthetically creating a dummy tensor of ones:
-               #
-               inputs_dict["inputs_embeds"] = x
+               if not self.is_encoder_decoder:
+                   inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+               else:
+                   inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                   inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
                outputs = model(inputs_dict)
...
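To show what test_inputs_embeds feeds an encoder-decoder model, here is a toy sketch of precomputing embeddings and passing them under the new encoder_inputs_embeds / decoder_inputs_embeds keys. The tf.keras.layers.Embedding layer is only a stand-in for whatever model.get_input_embeddings() returns (which is exactly why _get_embeds above has to try several call forms), and the shapes are arbitrary:

import tensorflow as tf

# Stand-in for model.get_input_embeddings(); dimensions are arbitrary.
embedding = tf.keras.layers.Embedding(input_dim=99, output_dim=32)

encoder_input_ids = tf.constant([[5, 6, 7]], dtype=tf.int32)
decoder_input_ids = tf.constant([[1, 2, 3]], dtype=tf.int32)

inputs_dict = {
    'encoder_inputs_embeds': embedding(encoder_input_ids),  # shape (1, 3, 32)
    'decoder_inputs_embeds': embedding(decoder_input_ids),  # shape (1, 3, 32)
}
# The test then simply calls the model on this dict: outputs = model(inputs_dict)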
transformers/tests/modeling_tf_t5_test.py
0 → 100644
View file @ a52d56c8
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import T5Config, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)


@require_tf
class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):

    is_encoder_decoder = True
    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()

    class TFT5ModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     n_positions=14,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     d_ff=37,
                     relative_attention_num_buckets=8,
                     dropout_rate=0.1,
                     initializer_factor=0.002,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_labels = None
            if self.use_labels:
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = T5Config(
                vocab_size_or_config_json_file=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, input_ids, input_mask, token_labels)

        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
            model = TFT5Model(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            encoder_output, decoder_output = model(inputs)

            encoder_output, decoder_output = model(input_ids,
                                                   decoder_attention_mask=input_mask,
                                                   encoder_input_ids=input_ids)

            result = {
                "encoder_output": encoder_output.numpy(),
                "decoder_output": decoder_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
            model = TFT5WithLMHeadModel(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            prediction_scores, decoder_output = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, token_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': input_ids,
                           'decoder_input_ids': input_ids,
                           'decoder_attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in ['t5-small']:
            model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
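For reference, a minimal sketch of exercising TFT5WithLMHeadModel the way TFT5ModelTester does, with the same tiny configuration; the token ids are arbitrary and the model is randomly initialized, so the scores are only meaningful for shape checks:

import tensorflow as tf
from transformers import T5Config
from transformers.modeling_tf_t5 import TFT5WithLMHeadModel

# Same tiny configuration as the tester above.
config = T5Config(vocab_size_or_config_json_file=99, n_positions=14, d_model=32, d_ff=37,
                  d_kv=32 // 4, num_layers=5, num_heads=4,
                  relative_attention_num_buckets=8, dropout_rate=0.1,
                  initializer_factor=0.002)
model = TFT5WithLMHeadModel(config)

input_ids = tf.constant([[3, 14, 15, 92, 65, 35, 89]], dtype=tf.int32)  # (1, 7), arbitrary ids
inputs = {'encoder_input_ids': input_ids,
          'decoder_input_ids': input_ids,
          'decoder_attention_mask': tf.ones_like(input_ids)}

prediction_scores, decoder_output = model(inputs)
# prediction_scores: (1, 7, 99), as asserted by create_and_check_t5_with_lm_head above.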
transformers/tests/modeling_tf_transfo_xl_test.py
View file @ a52d56c8
...
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
-           self.key_len = seq_length + mem_len
+           self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
...
transformers/tests/modeling_transfo_xl_test.py
View file @ a52d56c8
...
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
-           self.key_len = seq_length + mem_len
+           self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
...
transformers/tests/tokenization_bert_test.py
View file @ a52d56c8
...
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]

if __name__ == '__main__':
    unittest.main()