Merge branch 'master' into generation_sampler

cfa03805 · thomwolf · 300ec300 · 8618bf15 · cfa03805 · cfa03805
Commit cfa03805 authored Dec 21, 2019 by thomwolf
20 changed files
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 from transformers import is_torch_available
@@ -25,11 +24,12 @@ if is_torch_available():
    import torch
    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                              RobertaForSequenceClassification, RobertaForTokenClassification)
+    from transformers.modeling_roberta import RobertaEmbeddings
    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -199,12 +199,61 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
+    def test_create_position_ids_respects_padding_index(self):
+        """ Ensure that default position ids are sequential for non-padding tokens only. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = RobertaEmbeddings(config=config)
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor([[
+            0 + model.padding_idx + 1,
+            1 + model.padding_idx + 1,
+            2 + model.padding_idx + 1,
+            model.padding_idx
+        ]])
+        position_ids = model.create_position_ids_from_input_ids(input_ids)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+    def test_create_position_ids_from_inputs_embeds(self):
+        """ Ensure that position ids built from inputs_embeds are sequential. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        embeddings = RobertaEmbeddings(config=config)
+        inputs_embeds = torch.Tensor(2, 4, 30)
+        expected_single_positions = [
+            0 + embeddings.padding_idx + 1,
+            1 + embeddings.padding_idx + 1,
+            2 + embeddings.padding_idx + 1,
+            3 + embeddings.padding_idx + 1,
+        ]
+        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
+        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(
+            torch.all(torch.eq(position_ids, expected_positions))
+        )
 class RobertaModelIntegrationTest(unittest.TestCase):

--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import unittest
+from transformers import is_torch_available
+from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
+from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device
+if is_torch_available():
+    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
+    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
+@require_torch
+class T5ModelTest(CommonTestCases.CommonModelTester):
+    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    is_encoder_decoder = True
+    class T5ModelTester(object):
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     encoder_seq_length=7,
+                     decoder_seq_length=9,
+                     is_training=True,
+                     use_attention_mask=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_positions=14,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_factor=0.002,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.encoder_seq_length = encoder_seq_length
+            self.decoder_seq_length = decoder_seq_length
+            self.is_training = is_training
+            self.use_attention_mask = use_attention_mask
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_factor = initializer_factor
+            self.scope = scope
+        def prepare_config_and_inputs(self):
+            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+            encoder_attention_mask = None
+            decoder_attention_mask = None
+            if self.use_attention_mask:
+                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+            decoder_lm_labels = None
+            if self.use_labels:
+                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+            config = T5Config(
+                vocab_size=self.vocab_size,
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                d_kv=self.hidden_size // self.num_attention_heads,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_factor=self.initializer_factor)
+            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+            model = T5Model(config=config)
+            model.eval()
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids,
+                                                   encoder_attention_mask=encoder_attention_mask,
+                                                   decoder_attention_mask=decoder_attention_mask)
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids)
+            result = {
+                "encoder_output": encoder_output,
+                "decoder_output": decoder_output,
+            }
+            self.parent.assertListEqual(
+                list(result["encoder_output"].size()),
+                [self.batch_size, self.encoder_seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].size()),
+                [self.batch_size, self.decoder_seq_length, self.hidden_size])
+        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+            model = T5WithLMHeadModel(config=config)
+            model.eval()
+            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
+                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
+            loss, prediction_scores = outputs[0], outputs[1]
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.decoder_seq_length, self.vocab_size])
+            self.check_loss_output(result)
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
+             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': encoder_input_ids,
+                           'decoder_input_ids': decoder_input_ids,
+                           'decoder_attention_mask': decoder_attention_mask,
+                           'encoder_attention_mask': encoder_attention_mask}
+            return config, inputs_dict
+    def setUp(self):
+        self.model_tester = T5ModelTest.T5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
+    def test_config(self):
+        self.config_tester.run_common_tests()
+    def test_t5_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+    def test_with_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
+            self.assertIsNotNone(model)
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import AlbertConfig, is_tf_available
@@ -118,7 +117,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -217,12 +216,8 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
+        for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-        for model_name in ['albert-base-uncased']:
-            model = TFAlbertModel.from_pretrained(
-                model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -22,7 +22,7 @@ import logging
 from transformers import is_tf_available
-from .utils import require_tf, slow
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
 if is_tf_available():
    from transformers import (AutoConfig, BertConfig,
@@ -46,11 +46,11 @@ class TFAutoModelTest(unittest.TestCase):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
-            model = TFAutoModel.from_pretrained(model_name, force_download=True)
+            model = TFAutoModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertModel)
@@ -59,11 +59,11 @@ class TFAutoModelTest(unittest.TestCase):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
-            model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelWithLMHead.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForMaskedLM)
@@ -72,11 +72,11 @@ class TFAutoModelTest(unittest.TestCase):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
-            model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForSequenceClassification)
@@ -85,14 +85,19 @@ class TFAutoModelTest(unittest.TestCase):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
-            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForQuestionAnswering)
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, TFBertForMaskedLM)
 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import BertConfig, is_tf_available
@@ -114,7 +113,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -310,11 +309,9 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['bert-base-uncased']:
-            model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -69,6 +69,7 @@ class TFCommonTestCases:
        test_torchscript = True
        test_pruning = True
        test_resize_embeddings = True
+        is_encoder_decoder = False
        def test_initialization(self):
            pass
@@ -129,8 +130,12 @@ class TFCommonTestCases:
                                      for name, key in inputs_dict.items())
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
+                tfo = tf_model(inputs_dict, training=False)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tf_hidden_states = tfo[0].numpy()
+                pt_hidden_states = pto[0].numpy()
+                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                self.assertLessEqual(max_diff, 2e-2)
                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
@@ -150,13 +155,21 @@ class TFCommonTestCases:
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                self.assertLessEqual(max_diff, 2e-2)
        def test_compile_tf_model(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
@@ -189,7 +202,7 @@ class TFCommonTestCases:
                outputs_dict = model(inputs_dict)
                inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids')
+                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                outputs_keywords = model(input_ids, **inputs_keywords)
                output_dict = outputs_dict[0].numpy()
@@ -200,6 +213,11 @@ class TFCommonTestCases:
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
@@ -212,16 +230,28 @@ class TFCommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
+                    encoder_seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_key_length])
                out_len = len(outputs)
+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                outputs = model(inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)
@@ -230,8 +260,8 @@ class TFCommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
+                    encoder_seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_key_length])
        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -264,35 +294,53 @@ class TFCommonTestCases:
            for model_class in self.all_model_classes:
                model = model_class(config)
                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
+        def _get_embeds(self, wte, input_ids):
+            # In our TF models, the input embeddings layer can take slightly different forms,
+            # so we try a few call signatures in turn.
+            # As a last resort, we fall back to synthetically creating a dummy tensor of ones.
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x
        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
-            del inputs_dict["input_ids"]
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
            for model_class in self.all_model_classes:
                model = model_class(config)
                wte = model.get_input_embeddings()
-                try:
+                if not self.is_encoder_decoder:
-                    x = wte(input_ids, mode="embedding")
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
-                except:
+                else:
-                    try:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
-                        x = wte([input_ids], mode="embedding")
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
-                    except:
-                        try:
-                            x = wte([input_ids, None, None, None], mode="embedding")
-                        except:
-                            if hasattr(self.model_tester, "embedding_size"):
-                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                            else:
-                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try a few of them.
-                # We used to fall back to just synthetically creating a dummy tensor of ones:
-                #
-                inputs_dict["inputs_embeds"] = x
                outputs = model(inputs_dict)

--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import CTRLConfig, is_tf_available
@@ -112,7 +111,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -189,10 +188,8 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFCTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -20,7 +20,7 @@ import unittest
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import DistilBertConfig, is_tf_available
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
@@ -211,10 +211,8 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
    # @slow
    # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
+    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-    #         shutil.rmtree(cache_dir)
    #         self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import GPT2Config, is_tf_available
@@ -115,7 +114,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -220,10 +219,8 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import OpenAIGPTConfig, is_tf_available
@@ -114,7 +113,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -219,10 +218,8 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
 if __name__ == "__main__":

--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -17,11 +17,10 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import RobertaConfig, is_tf_available
@@ -109,7 +108,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -192,10 +191,8 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFRobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import unittest
+import sys
+from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
+from transformers import T5Config, is_tf_available
+if is_tf_available():
+    import tensorflow as tf
+    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
+                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+@require_tf
+class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
+    """Tests for the TensorFlow T5 models, built from a tiny `T5Config`.
+
+    Covers `TFT5Model` and `TFT5WithLMHeadModel`; the shared checks come from
+    the `TFCommonTestCases.TFCommonModelTester` base class.
+    """
+    # NOTE(review): presumably read by the common tester to select
+    # encoder/decoder style inputs - confirm in modeling_tf_common_test.
+    is_encoder_decoder = True
+    # Empty when TensorFlow is missing: the model classes are only imported
+    # under `if is_tf_available()` at the top of the file.
+    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
+    class TFT5ModelTester(object):
+        """Builds a small config plus random inputs and runs the shape checks."""
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_positions=14,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_factor=0.002,
+                     scope=None,
+                    ):
+            """Store the test case (`parent`, used for assertions) and the model sizes."""
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_factor = initializer_factor
+            self.scope = scope
+        def prepare_config_and_inputs(self):
+            """Return `(config, input_ids, input_mask, token_labels)`.
+
+            `input_mask` is None unless `use_input_mask` is set and
+            `token_labels` is None unless `use_labels` is set.
+            """
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_mask = None
+            if self.use_input_mask:
+                # vocab_size=2: presumably yields a random 0/1 mask - confirm in ids_tensor.
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+            token_labels = None
+            if self.use_labels:
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            config = T5Config(
+                vocab_size=self.vocab_size,
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                # Per-head size chosen so that num_heads * d_kv equals d_model here (4 * 8 == 32).
+                d_kv=self.hidden_size // self.num_attention_heads,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_factor=self.initializer_factor)
+            return (config, input_ids, input_mask, token_labels)
+        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
+            """Run `TFT5Model` and check that both outputs are [batch, seq, hidden].
+
+            The model is called twice, once with a dict of inputs and once with
+            a positional tensor plus keyword arguments. Only the outputs of the
+            second call are inspected; the first call's outputs are overwritten.
+            `token_labels` is accepted but not used.
+            """
+            model = TFT5Model(config=config)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            encoder_output, decoder_output = model(inputs)
+            encoder_output, decoder_output = model(input_ids,
+                                                   decoder_attention_mask=input_mask,
+                                                   encoder_input_ids=input_ids)
+            result = {
+                "encoder_output": encoder_output.numpy(),
+                "decoder_output": decoder_output.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["encoder_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
+            """Run `TFT5WithLMHeadModel` and check the scores are [batch, seq, vocab].
+
+            The second model output (`decoder_output`) and `token_labels` are
+            not checked.
+            """
+            model = TFT5WithLMHeadModel(config=config)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            prediction_scores, decoder_output = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+        def prepare_config_and_inputs_for_common(self):
+            """Return `(config, inputs_dict)`; the labels are dropped from the inputs."""
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, input_mask, token_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': input_ids,
+                           'decoder_input_ids': input_ids,
+                           'decoder_attention_mask': input_mask}
+            return config, inputs_dict
+    def setUp(self):
+        """Create the model tester and a `ConfigTester` for `T5Config` (d_model=37)."""
+        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
+    def test_config(self):
+        """Run the shared configuration checks."""
+        self.config_tester.run_common_tests()
+    def test_t5_model(self):
+        """Check the output shapes of the base model."""
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+    def test_with_lm_head(self):
+        """Check the output shape of the model with a language-modeling head."""
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+    @slow
+    def test_model_from_pretrained(self):
+        """Load the 't5-small' checkpoint, using the shared test cache directory."""
+        for model_name in ['t5-small']:
+            model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
+            self.assertIsNotNone(model)
+# Allow running this test module directly as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -18,11 +18,10 @@ from __future__ import print_function
 import unittest
 import random
-import shutil
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 from transformers import TransfoXLConfig, is_tf_available
@@ -67,7 +66,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
@@ -92,7 +91,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
@@ -205,10 +204,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 from transformers import is_tf_available
@@ -31,7 +30,7 @@ if is_tf_available():
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 @require_tf
@@ -125,7 +124,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
            config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
@@ -252,10 +251,8 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFXLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -20,7 +20,6 @@ import os
 import unittest
 import json
 import random
-import shutil
 from transformers import XLNetConfig, is_tf_available
@@ -35,7 +34,7 @@ if is_tf_available():
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 @require_tf
@@ -64,7 +63,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                     num_attention_heads=4,
                     d_inner=128,
                     num_hidden_layers=5,
-                     max_position_embeddings=10,
                     type_sequence_label_size=2,
                     untie_r=True,
                     bi_data=False,
@@ -88,7 +86,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
            self.num_attention_heads = num_attention_heads
            self.d_inner = d_inner
            self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
            self.bi_data = bi_data
            self.untie_r = untie_r
            self.same_length = same_length
@@ -122,13 +119,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
            config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                d_model=self.hidden_size,
                n_head=self.num_attention_heads,
                d_inner=self.d_inner,
                n_layer=self.num_hidden_layers,
                untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                same_length=self.same_length,
@@ -322,10 +318,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TFXLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 import unittest
 import random
-import shutil
 from transformers import is_torch_available
@@ -29,7 +28,7 @@ if is_torch_available():
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
@@ -66,7 +65,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
@@ -91,7 +90,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
@@ -208,10 +207,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-import shutil
 from transformers import is_torch_available
@@ -28,7 +27,7 @@ if is_torch_available():
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
@@ -121,7 +120,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
@@ -318,10 +317,8 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -20,7 +20,6 @@ import os
 import unittest
 import json
 import random
-import shutil
 from transformers import is_torch_available
@@ -33,7 +32,7 @@ if is_torch_available():
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 @require_torch
@@ -60,7 +59,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                     num_attention_heads=4,
                     d_inner=128,
                     num_hidden_layers=5,
-                     max_position_embeddings=10,
                     type_sequence_label_size=2,
                     untie_r=True,
                     bi_data=False,
@@ -84,7 +82,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
            self.num_attention_heads = num_attention_heads
            self.d_inner = d_inner
            self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
            self.bi_data = bi_data
            self.untie_r = untie_r
            self.same_length = same_length
@@ -116,13 +113,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
            config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                d_model=self.hidden_size,
                n_head=self.num_attention_heads,
                d_inner=self.d_inner,
                n_layer=self.num_hidden_layers,
                untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                same_length=self.same_length,
@@ -388,10 +384,8 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
+            model = XLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)

--- a/transformers/tests/pipelines_test.py
+++ b/transformers/tests/pipelines_test.py
+import unittest
+from typing import Iterable
+from transformers import pipeline
+from transformers.tests.utils import require_tf, require_torch
+# Model specifications used by the pipeline tests in this file.
+# Every entry is a (tokenizer, model, config) triple, unpacked in that order by
+# the tests and forwarded to `pipeline(...)`. A config of None means no explicit
+# config is passed. Names without a `TF_` prefix are used by the
+# `@require_torch` tests, `TF_`-prefixed ones by the `@require_tf` tests.
+QA_FINETUNED_MODELS = {
+    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+}
+TF_QA_FINETUNED_MODELS = {
+    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+}
+# NER: the TF entry points at the `tf_model.h5` weights, the PyTorch entry at
+# `pytorch_model.bin`; both share the same config URL.
+TF_NER_FINETUNED_MODELS = {
+    (
+        'bert-base-cased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+    )
+}
+NER_FINETUNED_MODELS = {
+    (
+        'bert-base-cased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+    )
+}
+FEATURE_EXTRACT_FINETUNED_MODELS = {
+   ('bert-base-cased', 'bert-base-cased', None),
+   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crashes on TF2
+   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+}
+TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
+   ('bert-base-cased', 'bert-base-cased', None),
+   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crashes on TF2
+   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+}
+# Text classification: same layout as the NER entries (weights URL + config URL).
+TF_TEXT_CLASSIF_FINETUNED_MODELS = {
+    (
+        'bert-base-uncased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+    )
+}
+TEXT_CLASSIF_FINETUNED_MODELS = {
+    (
+        'bert-base-uncased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+    )
+}
+class MonoColumnInputTestCase(unittest.TestCase):
+    """Pipeline tests for tasks that take plain strings as input.
+
+    Covers the 'ner', 'sentiment-analysis' and 'feature-extraction' tasks for
+    both the PyTorch and the TensorFlow model specifications.
+    """
+    def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
+        """Check a pipeline on a single input, a batch of inputs and invalid input.
+
+        Args:
+            nlp: the pipeline object under test (called like a function).
+            valid_inputs: inputs the pipeline must accept; the first one is
+                also sent on its own.
+            invalid_inputs: a list the pipeline must reject with an exception.
+            output_keys: keys that every checked result entry must contain.
+        """
+        self.assertIsNotNone(nlp)
+        # Single input: the result is a list whose items are dicts or lists.
+        mono_result = nlp(valid_inputs[0])
+        self.assertIsInstance(mono_result, list)
+        self.assertIsInstance(mono_result[0], (dict, list))
+        # Nested output: only the first inner list is inspected.
+        if isinstance(mono_result[0], list):
+            mono_result = mono_result[0]
+        for key in output_keys:
+            self.assertIn(key, mono_result[0])
+        # Batch of inputs: same structure checks, then every entry is checked.
+        multi_result = nlp(valid_inputs)
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], (dict, list))
+        if isinstance(multi_result[0], list):
+            multi_result = multi_result[0]
+        for result in multi_result:
+            for key in output_keys:
+                self.assertIn(key, result)
+        self.assertRaises(Exception, nlp, invalid_inputs)
+    @require_torch
+    def test_ner(self):
+        """NER pipeline (PyTorch): entries expose 'entity', 'word' and 'score'."""
+        mandatory_keys = {'entity', 'word', 'score'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in NER_FINETUNED_MODELS:
+            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+    @require_tf
+    def test_tf_ner(self):
+        """NER pipeline (TensorFlow): entries expose 'entity', 'word' and 'score'."""
+        mandatory_keys = {'entity', 'word', 'score'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_NER_FINETUNED_MODELS:
+            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+    @require_torch
+    def test_sentiment_analysis(self):
+        """Sentiment-analysis pipeline (PyTorch): entries expose 'label'."""
+        mandatory_keys = {'label'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+    @require_tf
+    def test_tf_sentiment_analysis(self):
+        """Sentiment-analysis pipeline (TensorFlow): entries expose 'label'."""
+        mandatory_keys = {'label'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+    @require_torch
+    def test_features_extraction(self):
+        """Feature-extraction pipeline (PyTorch); no mandatory output keys."""
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
+            # Must be 'feature-extraction': with 'sentiment-analysis' this test
+            # would only repeat the classification pipeline on base checkpoints.
+            nlp = pipeline(task='feature-extraction', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
+    @require_tf
+    def test_tf_features_extraction(self):
+        """Feature-extraction pipeline (TensorFlow); no mandatory output keys."""
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
+            # Must be 'feature-extraction': with 'sentiment-analysis' this test
+            # would only repeat the classification pipeline on base checkpoints.
+            nlp = pipeline(task='feature-extraction', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
+class MultiColumnInputTestCase(unittest.TestCase):
+    """Pipeline tests for tasks whose samples are dicts of named fields.
+
+    Currently covers the 'question-answering' task for the PyTorch and the
+    TensorFlow model specifications.
+    """
+    def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
+        """Check a pipeline on one sample, on a batch and on invalid samples.
+
+        A single sample must produce a dict and a batch must produce a list of
+        dicts, each containing every key of `output_keys`. Both the first
+        invalid sample alone and the whole invalid batch must raise.
+        """
+        self.assertIsNotNone(nlp)
+        # One sample in, one dict out.
+        single_answer = nlp(valid_inputs[0])
+        self.assertIsInstance(single_answer, dict)
+        for expected_key in output_keys:
+            self.assertIn(expected_key, single_answer)
+        # Batch in, list of dicts out.
+        batch_answers = nlp(valid_inputs)
+        self.assertIsInstance(batch_answers, list)
+        self.assertIsInstance(batch_answers[0], dict)
+        for answer in batch_answers:
+            for expected_key in output_keys:
+                self.assertIn(expected_key, answer)
+        # Invalid input must be rejected, alone or as a batch.
+        self.assertRaises(Exception, nlp, invalid_inputs[0])
+        self.assertRaises(Exception, nlp, invalid_inputs)
+    def _check_question_answering(self, model_specs):
+        """Run the question-answering checks for each (tokenizer, model, config) triple."""
+        expected_keys = {'score', 'answer', 'start', 'end'}
+        good_samples = [
+            {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
+            {
+                'question': 'In what field is HuggingFace working ?',
+                'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
+            }
+        ]
+        # Empty or missing question/context fields.
+        bad_samples = [
+            {'question': '', 'context': 'This is a test to try empty question edge case'},
+            {'question': None, 'context': 'This is a test to try empty question edge case'},
+            {'question': 'What is does with empty context ?', 'context': ''},
+            {'question': 'What is does with empty context ?', 'context': None},
+        ]
+        for tokenizer, model, config in model_specs:
+            qa_pipeline = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
+            self._test_multicolumn_pipeline(qa_pipeline, good_samples, bad_samples, expected_keys)
+    @require_torch
+    def test_question_answering(self):
+        """Question answering with the PyTorch model specifications."""
+        self._check_question_answering(QA_FINETUNED_MODELS)
+    @require_tf
+    def test_tf_question_answering(self):
+        """Question answering with the TensorFlow model specifications."""
+        self._check_question_answering(TF_QA_FINETUNED_MODELS)
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER
 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
            self.assertGreater(len(tokenizer), 0)
+    def test_tokenizer_from_pretrained_identifier(self):
+        """Load a tokenizer via SMALL_MODEL_IDENTIFIER and check its type and size."""
+        logging.basicConfig(level=logging.INFO)
+        loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        # AutoTokenizer must resolve this identifier to a BERT tokenizer with 12 entries.
+        self.assertIsInstance(loaded, BertTokenizer)
+        self.assertEqual(len(loaded), 12)
 if __name__ == "__main__":
    unittest.main()