chenpangpang / transformers · Commits

Commit 562f8640 (unverified)
Authored Dec 21, 2019 by Thomas Wolf; committed by GitHub, Dec 21, 2019
Parents: ca99a2d5 and 8618bf15
Changes: 199

    Merge branch 'master' into fix-xlnet-squad2.0
Showing 20 changed files with 967 additions and 415 deletions (+967 / -415):

transformers/tests/modeling_common_test.py            +139   -76
transformers/tests/modeling_ctrl_test.py                +7    -9
transformers/tests/modeling_distilbert_test.py         +10    -8
transformers/tests/modeling_encoder_decoder_test.py     +3    -4
transformers/tests/modeling_gpt2_test.py                +8    -9
transformers/tests/modeling_openai_test.py              +8    -9
transformers/tests/modeling_roberta_test.py            +66   -15
transformers/tests/modeling_t5_test.py                +182    -0
transformers/tests/modeling_tf_albert_test.py         +225    -0
transformers/tests/modeling_tf_auto_test.py            +20   -11
transformers/tests/modeling_tf_bert_test.py             +5    -9
transformers/tests/modeling_tf_common_test.py          +83  -196
transformers/tests/modeling_tf_ctrl_test.py             +5    -9
transformers/tests/modeling_tf_distilbert_test.py       +5    -8
transformers/tests/modeling_tf_gpt2_test.py             +5    -9
transformers/tests/modeling_tf_openai_gpt_test.py       +5    -9
transformers/tests/modeling_tf_roberta_test.py         +11   -15
transformers/tests/modeling_tf_t5_test.py             +169    -0
transformers/tests/modeling_tf_transfo_xl_test.py       +6   -10
transformers/tests/modeling_tf_xlm_test.py              +5    -9
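Note: a recurring change in the hunks below is the replacement of @pytest.mark.slow, the hard-coded /tmp/transformers_test/ cache directory, and CPU-only model calls with helpers imported from the tests' .utils module (CACHE_DIR, require_torch, slow, torch_device). That module is not part of the diff shown here; the sketch below is only inferred from how the updated tests use those names, not code from this commit, and the upstream implementation may differ.

# transformers/tests/utils.py -- hypothetical sketch, inferred from usage in the diffs below
import os
import tempfile
import unittest

from transformers import is_torch_available

# Shared cache directory reused by every from_pretrained() call in the tests.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test")

# Device that test models are moved to via model.to(torch_device).
if is_torch_available():
    import torch
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    torch_device = None


def require_torch(test_case):
    """Skip a test function or TestCase class when PyTorch is not installed."""
    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)


def slow(test_case):
    """Skip a test unless slow tests are explicitly enabled, e.g. with RUN_SLOW=1."""
    run_slow = os.environ.get("RUN_SLOW", "0").lower() in ("1", "true", "yes")
    return unittest.skipUnless(run_slow, "test is slow")(test_case)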
transformers/tests/modeling_common_test.py

@@ -18,7 +18,7 @@ from __future__ import print_function
 import copy
 import sys
-import os
+import os.path
 import shutil
 import tempfile
 import json
@@ -27,10 +27,11 @@ import uuid
 import unittest
 import logging
-import pytest

 from transformers import is_torch_available
+from .utils import CACHE_DIR, require_torch, slow, torch_device

 if is_torch_available():
     import torch
     import numpy as np
@@ -38,8 +39,6 @@ if is_torch_available():
     from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -59,12 +58,13 @@ else:
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init


 class CommonTestCases:

+    @require_torch
     class CommonModelTester(unittest.TestCase):

         model_tester = None
@@ -73,27 +73,30 @@ class CommonTestCases:
         test_pruning = True
         test_resize_embeddings = True
         test_head_masking = True
+        is_encoder_decoder = False

         def test_save_load(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)

+               out_2 = outputs[0].numpy()
+               out_2[np.isnan(out_2)] = 0
+
                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)
+                   model.to(torch_device)
                    with torch.no_grad():
                        after_outputs = model(**inputs_dict)

                    # Make sure we don't have nans
-                   out_1 = after_outputs[0].numpy()
-                   out_2 = outputs[0].numpy()
-                   out_1[np.isnan(out_1)] = 0
+                   out_1 = after_outputs[0].cpu().numpy()
+                   out_1 = out_1[~np.isnan(out_1)]
+                   out_2 = out_2[~np.isnan(out_2)]
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)
@@ -113,20 +116,34 @@ class CommonTestCases:
            for model_class in self.all_model_classes:
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
-               first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
-               self.assertEqual(first.ne(second).sum().item(), 0)
+               with torch.no_grad():
+                   first = model(**inputs_dict)[0]
+                   second = model(**inputs_dict)[0]
+               out_1 = first.cpu().numpy()
+               out_2 = second.cpu().numpy()
+               out_1 = out_1[~np.isnan(out_1)]
+               out_2 = out_2[~np.isnan(out_2)]
+               max_diff = np.amax(np.abs(out_1 - out_2))
+               self.assertLessEqual(max_diff, 1e-5)

        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

+           decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+           encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+           decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+           encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
-               outputs = model(**inputs_dict)
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
@@ -134,27 +151,42 @@ class CommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
                out_len = len(outputs)

+               if self.is_encoder_decoder:
+                   self.assertEqual(out_len % 2, 0)
+                   decoder_attentions = outputs[(out_len // 2) - 1]
+                   self.assertEqual(model.config.output_attentions, True)
+                   self.assertEqual(model.config.output_hidden_states, False)
+                   self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                   self.assertListEqual(
+                       list(decoder_attentions[0].shape[-3:]),
+                       [self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        decoder_key_length])
+
                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
-               outputs = model(**inputs_dict)
-               self.assertEqual(out_len + 1, len(outputs))
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)
+               self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)

-               attentions = outputs[-1]
-               self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+               self_attentions = outputs[-1]
+               self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
-                   list(attentions[0].shape[-3:]),
+                   list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])

        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -181,27 +213,32 @@ class CommonTestCases:
            configs_no_init.torchscript = True
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
+               model.to(torch_device)
                model.eval()
                inputs = inputs_dict['input_ids']  # Let's keep only input_ids

                try:
-                   torch.jit.trace(model, inputs)
+                   traced_gpt2 = torch.jit.trace(model, inputs)
                except RuntimeError:
                    self.fail("Couldn't trace module.")

-               try:
-                   traced_gpt2 = torch.jit.trace(model, inputs)
-                   torch.jit.save(traced_gpt2, "traced_model.pt")
-               except RuntimeError:
-                   self.fail("Couldn't save module.")
+               with TemporaryDirectory() as tmp_dir_name:
+                   pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")

-               try:
-                   loaded_model = torch.jit.load("traced_model.pt")
-                   os.remove("traced_model.pt")
-               except ValueError:
-                   self.fail("Couldn't load module.")
+                   try:
+                       torch.jit.save(traced_gpt2, pt_file_name)
+                   except Exception:
+                       self.fail("Couldn't save module.")

+                   try:
+                       loaded_model = torch.jit.load(pt_file_name)
+                   except Exception:
+                       self.fail("Couldn't load module.")

+               model.to(torch_device)
                model.eval()
+               loaded_model.to(torch_device)
                loaded_model.eval()

                model_params = model.parameters()
@@ -214,7 +251,6 @@ class CommonTestCases:
                self.assertTrue(models_equal)

        def test_headmasking(self):
            if not self.test_head_masking:
                return
@@ -228,11 +264,12 @@ class CommonTestCases:
            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
+               model.to(torch_device)
                model.eval()

                # Prepare head_mask
                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-               head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+               head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                head_mask[0, 0] = 0
                head_mask[-1, :-1] = 0
                head_mask.requires_grad_(requires_grad=True)
@@ -268,7 +305,6 @@ class CommonTestCases:
                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

        def test_head_pruning(self):
            if not self.test_pruning:
                return
@@ -282,11 +318,13 @@ class CommonTestCases:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config=config)
+               model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                  -1: [0]}
                model.prune_heads(heads_to_prune)
-               outputs = model(**inputs_dict)
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)

                attentions = outputs[-1]
@@ -310,23 +348,24 @@ class CommonTestCases:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config=config)
+               model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                  -1: [0]}
                model.prune_heads(heads_to_prune)
-               directory = "pruned_model"
-               if not os.path.exists(directory):
-                   os.makedirs(directory)
-               model.save_pretrained(directory)
-               model = model_class.from_pretrained(directory)
-               outputs = model(**inputs_dict)
+               with TemporaryDirectory() as temp_dir_name:
+                   model.save_pretrained(temp_dir_name)
+                   model = model_class.from_pretrained(temp_dir_name)
+                   model.to(torch_device)
+
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)

                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-               shutil.rmtree(directory)

        def test_head_pruning_save_load_from_config_init(self):
            if not self.test_pruning:
@@ -346,9 +385,11 @@ class CommonTestCases:
                config.pruned_heads = heads_to_prune
                model = model_class(config=config)
+               model.to(torch_device)
                model.eval()

-               outputs = model(**inputs_dict)
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)

                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
@@ -372,9 +413,11 @@ class CommonTestCases:
                config.pruned_heads = heads_to_prune
                model = model_class(config=config)
+               model.to(torch_device)
                model.eval()

-               outputs = model(**inputs_dict)
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)

                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -382,15 +425,13 @@ class CommonTestCases:
            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

-           directory = "pruned_model"
-           if not os.path.exists(directory):
-               os.makedirs(directory)
-           model.save_pretrained(directory)
-           model = model_class.from_pretrained(directory)
-           shutil.rmtree(directory)
+           with TemporaryDirectory() as temp_dir_name:
+               model.save_pretrained(temp_dir_name)
+               model = model_class.from_pretrained(temp_dir_name)
+               model.to(torch_device)

-           outputs = model(**inputs_dict)
+           with torch.no_grad():
+               outputs = model(**inputs_dict)

            attentions = outputs[-1]
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -401,7 +442,8 @@ class CommonTestCases:
            heads_to_prune = {0: [0], 2: [1, 2]}
            model.prune_heads(heads_to_prune)

-           outputs = model(**inputs_dict)
+           with torch.no_grad():
+               outputs = model(**inputs_dict)

            attentions = outputs[-1]
            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -411,7 +453,6 @@ class CommonTestCases:
            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})

        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -419,15 +460,18 @@ class CommonTestCases:
                config.output_hidden_states = True
                config.output_attentions = False
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
-               outputs = model(**inputs_dict)
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
-                   [self.model_tester.seq_length, self.model_tester.hidden_size])
+                   [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                    self.model_tester.hidden_size])

        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -533,17 +577,29 @@ class CommonTestCases:
        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-           input_ids = inputs_dict["input_ids"]
-           del inputs_dict["input_ids"]
+           if not self.is_encoder_decoder:
+               input_ids = inputs_dict["input_ids"]
+               del inputs_dict["input_ids"]
+           else:
+               encoder_input_ids = inputs_dict["encoder_input_ids"]
+               decoder_input_ids = inputs_dict["decoder_input_ids"]
+               del inputs_dict["encoder_input_ids"]
+               del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)
+               model.to(torch_device)
                model.eval()

                wte = model.get_input_embeddings()
-               inputs_dict["inputs_embeds"] = wte(input_ids)
-               outputs = model(**inputs_dict)
+               if not self.is_encoder_decoder:
+                   inputs_dict["inputs_embeds"] = wte(input_ids)
+               else:
+                   inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                   inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
+
+               with torch.no_grad():
+                   outputs = model(**inputs_dict)

    class GPTModelTester(CommonModelTester):
@@ -615,7 +671,7 @@ class CommonTestCases:
            mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)

            config = self.config_class(
-               vocab_size_or_config_json_file=self.vocab_size,
+               vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
@@ -628,11 +684,13 @@ class CommonTestCases:
        def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_ids):
            model = self.base_model_class(config)
+           model.to(torch_device)
            model.eval()
-           outputs = model(input_ids, position_ids, token_type_ids)
-           outputs = model(input_ids, position_ids)
-           outputs = model(input_ids)
+           with torch.no_grad():
+               outputs = model(input_ids, position_ids, token_type_ids)
+               outputs = model(input_ids, position_ids)
+               outputs = model(input_ids)

            hidden_state = outputs[0]
            self.parent.assertListEqual(
@@ -643,8 +701,10 @@ class CommonTestCases:
        def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                     mc_labels, lm_labels, mc_token_ids):
            model = self.lm_head_model_class(config)
+           model.to(torch_device)
            model.eval()
-           outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+           with torch.no_grad():
+               outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
            loss, lm_logits = outputs[:2]

            total_voc = self.vocab_size
@@ -659,8 +719,10 @@ class CommonTestCases:
                                      mc_labels, lm_labels, mc_token_ids):
            for model_class in self.all_model_classes:
                model = model_class(config)
+               model.to(torch_device)
                model.eval()
-               outputs = model(input_ids)
+               with torch.no_grad():
+                   outputs = model(input_ids)
                presents = outputs[-1]
                self.parent.assertEqual(self.num_hidden_layers, len(presents))
                self.parent.assertListEqual(
@@ -671,8 +733,10 @@ class CommonTestCases:
        def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                          mc_labels, lm_labels, mc_token_ids):
            model = self.double_head_model_class(config)
+           model.to(torch_device)
            model.eval()
-           outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                           token_type_ids=token_type_ids, position_ids=position_ids)
+           with torch.no_grad():
+               outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+                               token_type_ids=token_type_ids, position_ids=position_ids)
            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
            loss = [lm_loss, mc_loss]
@@ -689,10 +753,8 @@ class CommonTestCases:
                [[], []])

        def create_and_check_model_from_pretrained(self):
-           cache_dir = "/tmp/transformers_test/"
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
-               model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
-               shutil.rmtree(cache_dir)
+               model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
                self.parent.assertIsNotNone(model)

        def prepare_config_and_inputs_for_common(self):
@@ -716,7 +778,7 @@ class CommonTestCases:
            config_and_inputs = self.prepare_config_and_inputs()
            self.create_and_check_presents(*config_and_inputs)

-       @pytest.mark.slow
+       @slow
        def run_slow_tests(self):
            self.create_and_check_model_from_pretrained()
@@ -770,7 +832,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

-   return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+   return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()


def floats_tensor(shape, scale=1.0, rng=None, name=None):
@@ -786,11 +848,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
    for _ in range(total_dims):
        values.append(rng.random() * scale)

-   return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+   return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()


+@require_torch
class ModelUtilsTest(unittest.TestCase):

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
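The edits to modeling_common_test.py above repeat one device-agnostic pattern that also drives the per-model test changes below: move the model to torch_device, run forward passes under torch.no_grad(), and bring tensors back to the CPU before comparing them with NumPy. The snippet below is a minimal, self-contained illustration of that pattern only; it is not code from this commit, and torch.nn.Linear merely stands in for a real transformers model.

import numpy as np
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"

model = torch.nn.Linear(4, 2)      # placeholder for a transformers model
model.to(torch_device)             # run on GPU when available, CPU otherwise
model.eval()

inputs = torch.ones(3, 4, device=torch_device)  # inputs are created on the same device

with torch.no_grad():              # inference only: no autograd graph is built
    out_1 = model(inputs)
    out_2 = model(inputs)

# Tensors must come back to the CPU before NumPy can read them.
a = out_1.cpu().numpy()
b = out_2.cpu().numpy()
assert np.amax(np.abs(a - b)) <= 1e-5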
transformers/tests/modeling_ctrl_test.py

@@ -16,8 +16,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import pytest
-import shutil
 import pdb

 from transformers import is_torch_available
@@ -25,13 +23,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
                               CTRLLMHeadModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device


+@require_torch
 class CTRLModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
@@ -115,7 +113,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = CTRLConfig(
-           vocab_size_or_config_json_file=self.vocab_size,
+           vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
@@ -140,6 +138,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = CTRLModel(config=config)
+       model.to(torch_device)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -157,6 +156,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = CTRLLMHeadModel(config)
+       model.to(torch_device)
        model.eval()

        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -202,12 +202,10 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
-       cache_dir = "/tmp/transformers_test/"
        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-           model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
-           shutil.rmtree(cache_dir)
+           model = CTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
transformers/tests/modeling_distilbert_test.py

@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import pytest

 from transformers import is_torch_available
@@ -25,13 +24,13 @@ if is_torch_available():
     from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                               DistilBertForTokenClassification,
                               DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device


+@require_torch
 class DistilBertModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
@@ -106,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = DistilBertConfig(
-           vocab_size_or_config_json_file=self.vocab_size,
+           vocab_size=self.vocab_size,
            dim=self.hidden_size,
            n_layers=self.num_hidden_layers,
            n_heads=self.num_attention_heads,
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = DistilBertModel(config=config)
+       model.to(torch_device)
        model.eval()
        (sequence_output,) = model(input_ids, input_mask)
        (sequence_output,) = model(input_ids)
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = DistilBertForMaskedLM(config=config)
+       model.to(torch_device)
        model.eval()
        loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)

        result = {
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = DistilBertForQuestionAnswering(config=config)
+       model.to(torch_device)
        model.eval()
        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)

        result = {
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = DistilBertForSequenceClassification(config)
+       model.to(torch_device)
        model.eval()
        loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)

        result = {
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = DistilBertForTokenClassification(config=config)
+       model.to(torch_device)
        model.eval()

        loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
@@ -229,12 +233,10 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)

-   # @pytest.mark.slow
+   # @slow
    # def test_model_from_pretrained(self):
-   #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-   #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-   #         shutil.rmtree(cache_dir)
+   #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
    #         self.assertIsNotNone(model)

if __name__ == "__main__":
transformers/tests/modeling_encoder_decoder_test.py

@@ -15,19 +15,18 @@
 import logging
 import unittest
-import pytest

 from transformers import is_torch_available
+from .utils import require_torch, slow

 if is_torch_available():
     from transformers import BertModel, BertForMaskedLM, Model2Model
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")


+@require_torch
 class EncoderDecoderModelTest(unittest.TestCase):

-    @pytest.mark.slow
+    @slow
     def test_model2model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_gpt2_test.py

@@ -17,21 +17,19 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import pytest
-import shutil

 from transformers import is_torch_available

 if is_torch_available():
     from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2DoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device


+@require_torch
 class GPT2ModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
@@ -111,7 +109,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
-           vocab_size_or_config_json_file=self.vocab_size,
+           vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
@@ -136,6 +134,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
+       model.to(torch_device)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -153,6 +152,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2LMHeadModel(config)
+       model.to(torch_device)
        model.eval()

        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -171,6 +171,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
        model = GPT2DoubleHeadsModel(config)
+       model.to(torch_device)
        model.eval()
@@ -235,12 +236,10 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
-       cache_dir = "/tmp/transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-           model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-           shutil.rmtree(cache_dir)
+           model = GPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
transformers/tests/modeling_openai_test.py

@@ -17,21 +17,19 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import pytest
-import shutil

 from transformers import is_torch_available

 if is_torch_available():
     from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device


+@require_torch
 class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
@@ -99,7 +97,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = OpenAIGPTConfig(
-           vocab_size_or_config_json_file=self.vocab_size,
+           vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
@@ -124,6 +122,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTModel(config=config)
+       model.to(torch_device)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -139,6 +138,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTLMHeadModel(config)
+       model.to(torch_device)
        model.eval()

        loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -157,6 +157,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTDoubleHeadsModel(config)
+       model.to(torch_device)
        model.eval()

        loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
@@ -203,12 +204,10 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
-       cache_dir = "/tmp/transformers_test/"
        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-           model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-           shutil.rmtree(cache_dir)
+           model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)
transformers/tests/modeling_roberta_test.py

@@ -17,8 +17,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil
-import pytest

 from transformers import is_torch_available

@@ -26,14 +24,15 @@ if is_torch_available():
     import torch
     from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                               RobertaForSequenceClassification, RobertaForTokenClassification)
+    from transformers.modeling_roberta import RobertaEmbeddings
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device


+@require_torch
 class RobertaModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()

@@ -107,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

         config = RobertaConfig(
-            vocab_size_or_config_json_file=self.vocab_size,
+            vocab_size=self.vocab_size,
             hidden_size=self.hidden_size,
             num_hidden_layers=self.num_hidden_layers,
             num_attention_heads=self.num_attention_heads,

@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                            token_labels, choice_labels):
             model = RobertaModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)

@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                                    token_labels, choice_labels):
             model = RobertaForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {

@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                                                          sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = RobertaForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                  labels=token_labels)

@@ -195,22 +197,71 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)

+    def test_create_position_ids_respects_padding_index(self):
+        """ Ensure that the default position ids only assign a sequential . This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = RobertaEmbeddings(config=config)
+
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor([[0 + model.padding_idx + 1,
+                                               1 + model.padding_idx + 1,
+                                               2 + model.padding_idx + 1,
+                                               model.padding_idx]])
+
+        position_ids = model.create_position_ids_from_input_ids(input_ids)
+        self.assertEqual(position_ids.shape, expected_positions.shape)
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+
+    def test_create_position_ids_from_inputs_embeds(self):
+        """ Ensure that the default position ids only assign a sequential . This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        embeddings = RobertaEmbeddings(config=config)
+
+        inputs_embeds = torch.Tensor(2, 4, 30)
+        expected_single_positions = [
+            0 + embeddings.padding_idx + 1,
+            1 + embeddings.padding_idx + 1,
+            2 + embeddings.padding_idx + 1,
+            3 + embeddings.padding_idx + 1,
+        ]
+        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
+        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
+        self.assertEqual(position_ids.shape, expected_positions.shape)
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+

 class RobertaModelIntegrationTest(unittest.TestCase):

-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')

         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 11, 50265))

@@ -228,10 +279,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )

-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')

         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.

@@ -244,10 +295,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )

-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')

         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 3))
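The two new regression tests above pin down how RoBERTa builds position ids around its padding index (issue 1761): non-padding tokens get incremental positions starting at padding_idx + 1, and padding tokens keep padding_idx itself. Below is a minimal stand-alone reimplementation consistent with what the tests assert; the helper name mirrors create_position_ids_from_input_ids, and padding_idx=1 (RoBERTa's default) is assumed for the worked example.

import torch

def create_position_ids_from_input_ids(input_ids, padding_idx):
    # Non-padding tokens get positions padding_idx + 1, padding_idx + 2, ...;
    # padding tokens keep padding_idx itself.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1) * mask
    return incremental_indices.long() + padding_idx

input_ids = torch.tensor([[12, 31, 13, 1]])          # last token is padding
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1]]) -- matches the expectation in the test above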
transformers/tests/modeling_t5_test.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest

from transformers import is_torch_available

from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
from .configuration_common_test import ConfigTester
from .utils import CACHE_DIR, require_torch, slow, torch_device

if is_torch_available():
    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP


@require_torch
class T5ModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    is_encoder_decoder = True

    class T5ModelTester(object):

        def __init__(self, parent,
                     batch_size=13, encoder_seq_length=7, decoder_seq_length=9,
                     is_training=True, use_attention_mask=True, use_labels=True,
                     vocab_size=99, n_positions=14, hidden_size=32,
                     num_hidden_layers=5, num_attention_heads=4, d_ff=37,
                     relative_attention_num_buckets=8, dropout_rate=0.1,
                     initializer_factor=0.002, scope=None):
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
            self.decoder_seq_length = decoder_seq_length
            self.is_training = is_training
            self.use_attention_mask = use_attention_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope

        def prepare_config_and_inputs(self):
            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            encoder_attention_mask = None
            decoder_attention_mask = None
            if self.use_attention_mask:
                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

            decoder_lm_labels = None
            if self.use_labels:
                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, encoder_input_ids, decoder_input_ids,
                    encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids,
                                      encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            model = T5Model(config=config)
            model.eval()
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids,
                                                   encoder_attention_mask=encoder_attention_mask,
                                                   decoder_attention_mask=decoder_attention_mask)
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids)

            result = {
                "encoder_output": encoder_output,
                "decoder_output": decoder_output,
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].size()),
                [self.batch_size, self.encoder_seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].size()),
                [self.batch_size, self.decoder_seq_length, self.hidden_size])

        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids,
                                             encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            model = T5WithLMHeadModel(config=config)
            model.eval()
            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
            loss, prediction_scores = outputs[0], outputs[1]
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.decoder_seq_length, self.vocab_size])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, encoder_input_ids, decoder_input_ids,
             encoder_attention_mask, decoder_attention_mask, decoder_lm_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': encoder_input_ids,
                           'decoder_input_ids': decoder_input_ids,
                           'decoder_attention_mask': decoder_attention_mask,
                           'encoder_attention_mask': encoder_attention_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = T5ModelTest.T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
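The tester above exercises T5 with a deliberately tiny configuration so the common tests run quickly on CPU. Below is a hedged stand-alone sketch of the same pattern; it is valid only for the transformers API as of this commit (T5WithLMHeadModel and the encoder_input_ids/decoder_input_ids keyword names were later renamed), and the config values simply mirror the tester's defaults.

import torch
from transformers import T5Config, T5Model

# Tiny config mirroring T5ModelTester (batch 13, encoder length 7, decoder length 9).
config = T5Config(vocab_size=99, n_positions=14, d_model=32, d_ff=37,
                  d_kv=32 // 4, num_layers=5, num_heads=4,
                  relative_attention_num_buckets=8, dropout_rate=0.1,
                  initializer_factor=0.002)
model = T5Model(config=config)
model.eval()

encoder_input_ids = torch.randint(0, 99, (13, 7))
decoder_input_ids = torch.randint(0, 99, (13, 9))
decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                        decoder_input_ids=decoder_input_ids)
print(encoder_output.shape)  # expected: torch.Size([13, 7, 32])
print(decoder_output.shape)  # expected: torch.Size([13, 9, 32])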
transformers/tests/modeling_tf_albert_test.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import CACHE_DIR, require_tf, slow

from transformers import AlbertConfig, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
                                                 TFAlbertForSequenceClassification,
                                                 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)


@require_tf
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFAlbertModel, TFAlbertForMaskedLM,
                         TFAlbertForSequenceClassification) if is_tf_available() else ()

    class TFAlbertModelTester(object):

        def __init__(self, parent,
                     batch_size=13, seq_length=7, is_training=True,
                     use_input_mask=True, use_token_type_ids=True, use_labels=True,
                     vocab_size=99, embedding_size=16, hidden_size=32,
                     num_hidden_layers=5, num_attention_heads=4, intermediate_size=37,
                     hidden_act="gelu", hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1, max_position_embeddings=512,
                     type_vocab_size=16, type_sequence_label_size=2,
                     initializer_range=0.02, num_labels=3, num_choices=4, scope=None):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.embedding_size = embedding_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask,
                                          sequence_labels, token_labels, choice_labels):
            model = TFAlbertModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # sequence_output, pooled_output = model(**inputs)

            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
            sequence_output, pooled_output = model(inputs)

            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output.numpy(),
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                                  sequence_labels, token_labels, choice_labels):
            model = TFAlbertForMaskedLM(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores, = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask,
                                                                sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = TFAlbertForSequenceClassification(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
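One detail worth noting in create_and_check_albert_model above: the TF models accept the same inputs as a keyword dict, as a positional list, or as a bare tensor of input ids. A hedged, minimal illustration under the API of this commit follows; the config values are simply the tester's small defaults, not anything required by the library.

import tensorflow as tf
from transformers import AlbertConfig
from transformers.modeling_tf_albert import TFAlbertModel

config = AlbertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=5,
                      num_attention_heads=4, intermediate_size=37)
model = TFAlbertModel(config=config)

input_ids = tf.constant([[31, 51, 12], [15, 5, 0]])
attention_mask = tf.constant([[1, 1, 1], [1, 1, 0]])

# All three call styles return the same (sequence_output, pooled_output) pair.
sequence_output, pooled_output = model({'input_ids': input_ids, 'attention_mask': attention_mask})
sequence_output, pooled_output = model([input_ids, attention_mask])
sequence_output, pooled_output = model(input_ids)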
transformers/tests/modeling_tf_auto_test.py

@@ -18,11 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 import logging

 from transformers import is_tf_available
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER

 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
                               TFAutoModel, TFBertModel,

@@ -33,11 +34,11 @@ if is_tf_available():
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")


+@require_tf
 class TFAutoModelTest(unittest.TestCase):
+    @slow
     def test_model_from_pretrained(self):
         import h5py
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))

@@ -45,50 +46,58 @@ class TFAutoModelTest(unittest.TestCase):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)

-            model = TFAutoModel.from_pretrained(model_name, force_download=True)
+            model = TFAutoModel.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertModel)

+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)

-            model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelWithLMHead.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForMaskedLM)

+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)

-            model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForSequenceClassification)

+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)

-            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForQuestionAnswering)

+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, TFBertForMaskedLM)
+

 if __name__ == "__main__":
     unittest.main()
transformers/tests/modeling_tf_bert_test.py

@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil
-import pytest
 import sys

 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow

 from transformers import BertConfig, is_tf_available

@@ -36,10 +35,9 @@ if is_tf_available():
                                             TFBertForTokenClassification,
                                             TFBertForQuestionAnswering,
                                             TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")


+@require_tf
 class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):

     all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,

@@ -115,7 +113,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

         config = BertConfig(
-            vocab_size_or_config_json_file=self.vocab_size,
+            vocab_size=self.vocab_size,
             hidden_size=self.hidden_size,
             num_hidden_layers=self.num_hidden_layers,
             num_attention_heads=self.num_attention_heads,

@@ -309,13 +307,11 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)


 if __name__ == "__main__":
transformers/tests/modeling_tf_common_test.py

@@ -25,18 +25,17 @@ import unittest
 import uuid
 import tempfile
-import pytest
 import sys

 from transformers import is_tf_available, is_torch_available
+from .utils import require_tf, slow

 if is_tf_available():
     import tensorflow as tf
     import numpy as np

     from transformers import TFPreTrainedModel
     # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")

 if sys.version_info[0] == 2:
     import cPickle as pickle

@@ -62,6 +61,7 @@ def _config_zero_init(config):

 class TFCommonTestCases:

+    @require_tf
     class TFCommonModelTester(unittest.TestCase):

         model_tester = None

@@ -69,6 +69,7 @@ class TFCommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        is_encoder_decoder = False

         def test_initialization(self):
             pass

@@ -129,8 +130,12 @@ class TFCommonTestCases:
                                       for name, key in inputs_dict.items())
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tf_model(inputs_dict, training=False)
+                tf_hidden_states = tfo[0].numpy()
+                pt_hidden_states = pto[0].numpy()
+                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                 self.assertLessEqual(max_diff, 2e-2)

                 # Check we can load pt model in tf and vice-versa with checkpoint => model functions

@@ -150,13 +155,21 @@ class TFCommonTestCases:
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                 self.assertLessEqual(max_diff, 2e-2)

         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
             metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

@@ -164,7 +177,7 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 # Prepare our model
                 model = model_class(config)

                 # Let's load it from the disk to be sure we can use pretrained weights
                 with TemporaryDirectory() as tmpdirname:
                     outputs = model(inputs_dict)  # build the model

@@ -189,7 +202,7 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)

                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids')
+                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                 outputs_keywords = model(input_ids, **inputs_keywords)

                 output_dict = outputs_dict[0].numpy()

@@ -200,6 +213,11 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False

@@ -212,16 +230,28 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])
                 out_len = len(outputs)

+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2) - 1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 outputs = model(inputs_dict)
-                self.assertEqual(out_len + 1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)

@@ -230,82 +260,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
-
-        def test_headmasking(self):
-            pass
-            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            # config.output_attentions = True
-            # config.output_hidden_states = True
-            # configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-            # for model_class in self.all_model_classes:
-            #     model = model_class(config=configs_no_init)
-            #     model.eval()
-            #     # Prepare head_mask
-            #     # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-            #     head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
-            #     head_mask[0, 0] = 0
-            #     head_mask[-1, :-1] = 0
-            #     head_mask.requires_grad_(requires_grad=True)
-            #     inputs = inputs_dict.copy()
-            #     inputs['head_mask'] = head_mask
-            #     outputs = model(**inputs)
-            #     # Test that we can get a gradient back for importance score computation
-            #     output = sum(t.sum() for t in outputs[0])
-            #     output = output.sum()
-            #     output.backward()
-            #     multihead_outputs = head_mask.grad
-            #     attentions = outputs[-1]
-            #     hidden_states = outputs[-2]
-            #     # Remove Nan
-            #     self.assertIsNotNone(multihead_outputs)
-            #     self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-            #     self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-            #     self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-            #     self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-            #     self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-            #     self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
-
-        def test_head_pruning(self):
-            pass
-            # if not self.test_pruning:
-            #     return
-            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            # for model_class in self.all_model_classes:
-            #     config.output_attentions = True
-            #     config.output_hidden_states = False
-            #     model = model_class(config=config)
-            #     model.eval()
-            #     heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-            #                       -1: [0]}
-            #     model.prune_heads(heads_to_prune)
-            #     outputs = model(**inputs_dict)
-            #     attentions = outputs[-1]
-            #     self.assertEqual(attentions[0].shape[-3], 1)
-            #     self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            #     self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+                     encoder_seq_length,
+                     encoder_key_length])

         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -323,43 +279,6 @@ class TFCommonTestCases:
                     list(hidden_states[0].shape[-2:]),
                     [self.model_tester.seq_length, self.model_tester.hidden_size])

-        def test_resize_tokens_embeddings(self):
-            pass
-            # original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            # if not self.test_resize_embeddings:
-            #     return
-            # for model_class in self.all_model_classes:
-            #     config = copy.deepcopy(original_config)
-            #     model = model_class(config)
-            #     model_vocab_size = config.vocab_size
-            #     # Retrieve the embeddings and clone theme
-            #     model_embed = model.resize_token_embeddings(model_vocab_size)
-            #     cloned_embeddings = model_embed.weight.clone()
-            #     # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-            #     model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-            #     self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-            #     # Check that it actually resizes the embeddings matrix
-            #     self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
-            #     # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-            #     model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-            #     self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
-            #     # Check that it actually resizes the embeddings matrix
-            #     self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
-            #     # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-            #     models_equal = True
-            #     for p1, p2 in zip(cloned_embeddings, model_embed.weight):
-            #         if p1.data.ne(p2.data).sum() > 0:
-            #             models_equal = False
-            #     self.assertTrue(models_equal)

         def test_model_common_attributes(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -369,67 +288,59 @@ class TFCommonTestCases:
             x = model.get_output_embeddings()
             assert x is None or isinstance(x, tf.keras.layers.Layer)

-        def test_tie_model_weights(self):
-            pass
-            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            # def check_same_values(layer_1, layer_2):
-            #     equal = True
-            #     for p1, p2 in zip(layer_1.weight, layer_2.weight):
-            #         if p1.data.ne(p2.data).sum() > 0:
-            #             equal = False
-            #     return equal
-            # for model_class in self.all_model_classes:
-            #     if not hasattr(model_class, 'tie_weights'):
-            #         continue
-            #     config.torchscript = True
-            #     model_not_tied = model_class(config)
-            #     params_not_tied = list(model_not_tied.parameters())
-            #     config_tied = copy.deepcopy(config)
-            #     config_tied.torchscript = False
-            #     model_tied = model_class(config_tied)
-            #     params_tied = list(model_tied.parameters())
-            #     # Check that the embedding layer and decoding layer are the same in size and in value
-            #     self.assertGreater(len(params_not_tied), len(params_tied))
-            #     # Check that after resize they remain tied.
-            #     model_tied.resize_token_embeddings(config.vocab_size + 10)
-            #     params_tied_2 = list(model_tied.parameters())
-            #     self.assertGreater(len(params_not_tied), len(params_tied))
-            #     self.assertEqual(len(params_tied_2), len(params_tied))

         def test_determinism(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

             for model_class in self.all_model_classes:
                 model = model_class(config)
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
+
+        def _get_embeds(self, wte, input_ids):
+            # ^^ In our TF models, the input_embeddings can take slightly different forms,
+            # so we try a few of them.
+            # We used to fall back to just synthetically creating a dummy tensor of ones:
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x

         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]

             for model_class in self.all_model_classes:
                 model = model_class(config)

                 wte = model.get_input_embeddings()
-                try:
-                    x = wte(input_ids, mode="embedding")
-                except:
-                    try:
-                        x = wte([input_ids], mode="embedding")
-                    except:
-                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try two of them and fall back to just synthetically creating a dummy tensor of ones.
-                inputs_dict["inputs_embeds"] = x
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
                 outputs = model(inputs_dict)

@@ -453,29 +364,5 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     return output


-class TFModelUtilsTest(unittest.TestCase):
-    @pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
-    def test_model_from_pretrained(self):
-        pass
-        # logging.basicConfig(level=logging.INFO)
-        # for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        #     config = BertConfig.from_pretrained(model_name)
-        #     self.assertIsNotNone(config)
-        #     self.assertIsInstance(config, PretrainedConfig)
-        #     model = BertModel.from_pretrained(model_name)
-        #     model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-        #     self.assertIsNotNone(model)
-        #     self.assertIsInstance(model, PreTrainedModel)
-        #     for value in loading_info.values():
-        #         self.assertEqual(len(value), 0)
-        #     config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-        #     model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-        #     self.assertEqual(model.config.output_attentions, True)
-        #     self.assertEqual(model.config.output_hidden_states, True)
-        #     self.assertEqual(model.config, config)
-

 if __name__ == "__main__":
     unittest.main()
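The pt/tf equivalence and determinism checks above now zero out (or filter) NaN entries on both sides before taking the maximum absolute difference, so a stray NaN on either side no longer forces the max-difference itself to NaN and fails the assertion. A self-contained numpy illustration of that masking step, using made-up values:

import numpy as np

tf_hidden_states = np.array([[0.10, np.nan, 0.30]])
pt_hidden_states = np.array([[0.11, np.nan, 0.29]])

# Zero the NaNs on both sides, then compare the largest remaining deviation.
tf_hidden_states[np.isnan(tf_hidden_states)] = 0
pt_hidden_states[np.isnan(pt_hidden_states)] = 0

max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
assert max_diff <= 2e-2, max_diff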
transformers/tests/modeling_tf_ctrl_test.py

@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import CTRLConfig, is_tf_available
@@ -30,10 +29,9 @@ if is_tf_available():
     import tensorflow as tf
     from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
                                                TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
@@ -113,7 +111,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -188,12 +186,10 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFCTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
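The new `from .utils import CACHE_DIR, require_tf, slow` line, repeated in every TF test file below, replaces the per-file `pytest` skip markers and the hand-rolled `/tmp/transformers_test/` cache directory that was deleted with `shutil.rmtree` after each run. The helpers themselves are not shown in this diff; the following is only a minimal sketch of what such a `tests/utils.py` could look like (the `RUN_SLOW` environment variable and the temp-dir layout are assumptions, not the repository's exact implementation):

import os
import tempfile
import unittest

from transformers import is_tf_available, is_torch_available

# Shared cache directory so repeated from_pretrained() downloads are reused across tests.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test")


def require_tf(test_case):
    # Skip the decorated test (or whole TestCase) when TensorFlow is not installed.
    if not is_tf_available():
        return unittest.skip("test requires TensorFlow")(test_case)
    return test_case


def require_torch(test_case):
    # Same idea for the PyTorch-only tests.
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)
    return test_case


def slow(test_case):
    # Tests decorated with @slow only run when RUN_SLOW=1 is set in the environment.
    if os.environ.get("RUN_SLOW", "0").lower() not in ("1", "true", "yes"):
        return unittest.skip("slow test; set RUN_SLOW=1 to run it")(test_case)
    return test_case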
transformers/tests/modeling_tf_distilbert_test.py

@@ -17,10 +17,10 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import DistilBertConfig, is_tf_available
@@ -30,10 +30,9 @@ if is_tf_available():
                                                      TFDistilBertForMaskedLM,
                                                      TFDistilBertForQuestionAnswering,
                                                      TFDistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
@@ -108,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
@@ -210,12 +209,10 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
 
-    # @pytest.mark.slow
+    # @slow
     # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-    #         shutil.rmtree(cache_dir)
+    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
     #         self.assertIsNotNone(model)
 
 if __name__ == "__main__":
transformers/tests/modeling_tf_gpt2_test.py

@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import GPT2Config, is_tf_available
@@ -31,10 +30,9 @@ if is_tf_available():
     from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                                TFGPT2DoubleHeadsModel,
                                                TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
@@ -116,7 +114,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -219,12 +217,10 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
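Every configuration constructed in these tests switches from the old `vocab_size_or_config_json_file` keyword to a plain `vocab_size` argument; the remaining keywords are untouched. A small before/after illustration (the values are illustrative, in the spirit of the tester defaults, and carry no special meaning):

from transformers import GPT2Config

# old call style, removed by this commit:
#   config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=32, n_layer=5, n_head=4)

# new call style used throughout the updated tests:
config = GPT2Config(vocab_size=99, n_embd=32, n_layer=5, n_head=4)
print(config.vocab_size)  # -> 99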
transformers/tests/modeling_tf_openai_gpt_test.py

@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import OpenAIGPTConfig, is_tf_available
@@ -31,10 +30,9 @@ if is_tf_available():
     from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                  TFOpenAIGPTDoubleHeadsModel,
                                                  TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
@@ -115,7 +113,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -218,12 +216,10 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
transformers/tests/modeling_tf_roberta_test.py

@@ -17,11 +17,10 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import RobertaConfig, is_tf_available
@@ -32,10 +31,9 @@ if is_tf_available():
                                                   TFRobertaForSequenceClassification,
                                                   TFRobertaForTokenClassification,
                                                   TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFRobertaModel, TFRobertaForMaskedLM,
@@ -110,7 +108,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -191,22 +189,20 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFRobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 class TFRobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = [1, 11, 50265]
@@ -224,10 +220,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = TFRobertaModel.from_pretrained('roberta-base')
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
@@ -240,10 +236,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
         expected_shape = [1, 3]
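The RoBERTa integration tests above all follow the same pattern: load a released checkpoint, feed one hard-coded token sequence, and compare a small slice of the output against recorded reference values with a 1e-3 tolerance. A condensed sketch of that pattern (the `expected_slice` numbers below are placeholders, not the reference values from the test file, so the comparison is only illustrative):

import numpy
import tensorflow as tf
from transformers import TFRobertaModel

model = TFRobertaModel.from_pretrained('roberta-base')
# Hard-coded token ids, as in the tests above.
input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])

output = model(input_ids)[0]   # last hidden states; (1, 11, 768) for roberta-base
expected_slice = tf.constant([[[-0.02, 0.03, 0.01],
                               [0.01, -0.04, 0.02],
                               [0.00, 0.02, -0.01]]])  # placeholder reference values

# Tolerance-based comparison on the top-left 3x3 corner, as in the tests above.
print(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))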
transformers/tests/modeling_tf_t5_test.py (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import CACHE_DIR, require_tf, slow

from transformers import T5Config, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)


@require_tf
class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):

    is_encoder_decoder = True
    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()

    class TFT5ModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     n_positions=14,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     d_ff=37,
                     relative_attention_num_buckets=8,
                     dropout_rate=0.1,
                     initializer_factor=0.002,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_labels = None
            if self.use_labels:
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, input_ids, input_mask, token_labels)

        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
            model = TFT5Model(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            encoder_output, decoder_output = model(inputs)

            encoder_output, decoder_output = model(input_ids,
                                                   decoder_attention_mask=input_mask,
                                                   encoder_input_ids=input_ids)

            result = {
                "encoder_output": encoder_output.numpy(),
                "decoder_output": decoder_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
            model = TFT5WithLMHeadModel(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            prediction_scores, decoder_output = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, token_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': input_ids,
                           'decoder_input_ids': input_ids,
                           'decoder_attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in ['t5-small']:
            model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
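The new TF T5 test exercises the encoder-decoder call signature: inputs can be passed to `TFT5Model` as a single dict of `encoder_input_ids`, `decoder_input_ids` and `decoder_attention_mask`, and the model returns encoder and decoder hidden states whose shapes the test asserts. A minimal standalone sketch mirroring `create_and_check_t5_model` with the tester's default sizes (the output unpacking follows what the test asserts, not independently documented API behaviour):

import tensorflow as tf
from transformers import T5Config
from transformers.modeling_tf_t5 import TFT5Model

config = T5Config(vocab_size=99, n_positions=14, d_model=32, d_ff=37,
                  d_kv=32 // 4, num_layers=5, num_heads=4,
                  relative_attention_num_buckets=8,
                  dropout_rate=0.1, initializer_factor=0.002)
model = TFT5Model(config=config)

batch_size, seq_length = 13, 7
input_ids = tf.random.uniform((batch_size, seq_length), maxval=99, dtype=tf.int32)
attention_mask = tf.ones((batch_size, seq_length), dtype=tf.int32)

# Dict-style call, as in create_and_check_t5_model above.
encoder_output, decoder_output = model({'encoder_input_ids': input_ids,
                                        'decoder_input_ids': input_ids,
                                        'decoder_attention_mask': attention_mask})
print(encoder_output.shape, decoder_output.shape)   # expected: (13, 7, 32) (13, 7, 32)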
transformers/tests/modeling_tf_transfo_xl_test.py

@@ -18,11 +18,10 @@ from __future__ import print_function
 import unittest
 import random
-import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import TransfoXLConfig, is_tf_available
@@ -31,10 +30,9 @@ if is_tf_available():
     from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
                                                      TFTransfoXLLMHeadModel,
                                                      TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
+@require_tf
 class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
@@ -68,7 +66,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
-           self.key_len = seq_length + mem_len
+           self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
@@ -93,7 +91,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
            config = TransfoXLConfig(
-               vocab_size_or_config_json_file=self.vocab_size,
+               vocab_size=self.vocab_size,
               mem_len=self.mem_len,
               clamp_len=self.clamp_len,
               cutoffs=self.cutoffs,
@@ -204,12 +202,10 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
transformers/tests/modeling_tf_xlm_test.py

@@ -17,8 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
-import pytest
 
 from transformers import is_tf_available
@@ -29,13 +27,13 @@ if is_tf_available():
                                               TFXLMForSequenceClassification,
                                               TFXLMForQuestionAnsweringSimple,
                                               TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
 
+@require_tf
 class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
@@ -126,7 +124,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
            config = XLMConfig(
-               vocab_size_or_config_json_file=self.vocab_size,
+               vocab_size=self.vocab_size,
               n_special=self.n_special,
               emb_dim=self.hidden_size,
               n_layers=self.num_hidden_layers,
@@ -251,12 +249,10 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFXLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)