Unverified commit 17ea43cf, authored by Thomas Wolf, committed by GitHub

Merge pull request #1203 from huggingface/tf2

[2.0] TF 2.0 support
parents 4a233e5b 80bf868a
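This merge adds TF 2.0 test modules for DistilBERT, GPT-2, OpenAI GPT, RoBERTa, Transfo-XL, XLM, and XLNet, and updates the existing PyTorch Transfo-XL and XLM tests to guard their torch imports behind is_torch_available(). Each TF module skips itself when TensorFlow is not installed.

The tests rely on TF 2.0 eager execution: each model is a Keras layer that can be called directly, with outputs read back via .numpy(). What follows is a minimal sketch of that pattern, not part of the commit itself, assuming a TF 2.0 install and the TFDistilBertModel class added by this PR (config values borrowed from the test defaults below):

# Illustrative sketch only (assumes TF 2.0 and this branch installed).
import tensorflow as tf
from transformers import DistilBertConfig
from transformers.modeling_tf_distilbert import TFDistilBertModel

config = DistilBertConfig(vocab_size_or_config_json_file=99,
                          dim=32, n_layers=5, n_heads=4, hidden_dim=37)
model = TFDistilBertModel(config=config)
input_ids = tf.constant([[5, 6, 7, 8, 9]])  # (batch=1, seq_len=5)
sequence_output = model(input_ids)[0]       # eager call, no tf.Session needed
print(sequence_output.numpy().shape)        # -> (1, 5, 32)

# ===== First file: TF 2.0 DistilBERT tests =====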
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import pytest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from transformers import DistilBertConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_distilbert import (TFDistilBertModel,
TFDistilBertForMaskedLM,
TFDistilBertForQuestionAnswering,
TFDistilBertForSequenceClassification)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
                         TFDistilBertForSequenceClassification) if is_tf_available() else ()
test_pruning = True
test_torchscript = True
test_resize_embeddings = True
test_head_masking = True
class TFDistilBertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=False,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DistilBertConfig(
vocab_size_or_config_json_file=self.vocab_size,
dim=self.hidden_size,
n_layers=self.num_hidden_layers,
n_heads=self.num_attention_heads,
hidden_dim=self.intermediate_size,
hidden_act=self.hidden_act,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range)
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFDistilBertModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask}
outputs = model(inputs)
sequence_output = outputs[0]
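            # the same model can also be called with a positional list of inputs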
inputs = [input_ids, input_mask]
(sequence_output,) = model(inputs)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFDistilBertForMaskedLM(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask}
(prediction_scores,) = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFDistilBertForQuestionAnswering(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask}
start_logits, end_logits = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(
list(result["start_logits"].shape),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].shape),
[self.batch_size, self.seq_length])
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = TFDistilBertForSequenceClassification(config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFDistilBertModelTest.TFDistilBertModelTester(self)
self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_distilbert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
# @pytest.mark.slow
# def test_model_from_pretrained(self):
# cache_dir = "/tmp/transformers_test/"
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
# model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
# shutil.rmtree(cache_dir)
# self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
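# ===== Next file: TF 2.0 GPT-2 tests (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) =====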
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from transformers import GPT2Config, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel) if is_tf_available() else ()
# all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
class TFGPT2ModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_token_type_ids=True,
use_input_mask=True,
use_labels=True,
use_mc_token_ids=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_token_type_ids = use_token_type_ids
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.use_mc_token_ids = use_mc_token_ids
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
mc_token_ids = None
if self.use_mc_token_ids:
mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = GPT2Config(
vocab_size_or_config_json_file=self.vocab_size,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
# intermediate_size=self.intermediate_size,
# hidden_act=self.hidden_act,
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings
# type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2Model(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0]
inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0]
sequence_output = model(input_ids)[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2LMHeadModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0]
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
model = TFGPT2DoubleHeadsModel(config=config)
            multiple_choice_input_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
            inputs = {'input_ids': multiple_choice_input_ids,
'mc_token_ids': mc_token_ids,
'attention_mask': multiple_choice_input_mask,
'token_type_ids': multiple_choice_token_type_ids}
lm_logits, mc_logits = model(inputs)[:2]
result = {
"lm_logits": lm_logits.numpy(),
"mc_logits": mc_logits.numpy()
}
self.parent.assertListEqual(
list(result["lm_logits"].shape),
[self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(result["mc_logits"].shape),
[self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids,
mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_gpt2_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
def test_gpt2_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
def test_gpt2_double_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
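# ===== Next file: TF 2.0 OpenAI GPT tests (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) =====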
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from transformers import OpenAIGPTConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
TFOpenAIGPTDoubleHeadsModel,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
class TFOpenAIGPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_token_type_ids=True,
use_input_mask=True,
use_labels=True,
use_mc_token_ids=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_token_type_ids = use_token_type_ids
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.use_mc_token_ids = use_mc_token_ids
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
mc_token_ids = None
if self.use_mc_token_ids:
mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = OpenAIGPTConfig(
vocab_size_or_config_json_file=self.vocab_size,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
# intermediate_size=self.intermediate_size,
# hidden_act=self.hidden_act,
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings
# type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0]
inputs = [input_ids, input_mask]
sequence_output = model(inputs)[0]
sequence_output = model(input_ids)[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTLMHeadModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0]
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
model = TFOpenAIGPTDoubleHeadsModel(config=config)
            multiple_choice_input_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
            inputs = {'input_ids': multiple_choice_input_ids,
'mc_token_ids': mc_token_ids,
'attention_mask': multiple_choice_input_mask,
'token_type_ids': multiple_choice_token_type_ids}
lm_logits, mc_logits = model(inputs)[:2]
result = {
"lm_logits": lm_logits.numpy(),
"mc_logits": mc_logits.numpy()
}
self.parent.assertListEqual(
list(result["lm_logits"].shape),
[self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(result["mc_logits"].shape),
[self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids,
mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self)
self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_openai_gpt_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
def test_openai_gpt_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs)
def test_openai_gpt_double_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
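# ===== Next file: TF 2.0 RoBERTa tests (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) =====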
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from transformers import RobertaConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
import numpy
from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
TFRobertaForSequenceClassification,
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
    all_model_classes = (TFRobertaModel, TFRobertaForMaskedLM,
TFRobertaForSequenceClassification) if is_tf_available() else ()
class TFRobertaModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = RobertaConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = TFRobertaModel(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0]
inputs = [input_ids, input_mask]
sequence_output = model(inputs)[0]
sequence_output = model(input_ids)[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = TFRobertaForMaskedLM(config=config)
prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFRobertaModelTest.TFRobertaModelTester(self)
self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_roberta_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_roberta_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
class TFRobertaModelIntegrationTest(unittest.TestCase):
@pytest.mark.slow
def test_inference_masked_lm(self):
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = [1, 11, 50265]
self.assertEqual(
list(output.numpy().shape),
expected_shape
)
# compare the actual values for a slice.
expected_slice = tf.constant(
[[[33.8843, -4.3107, 22.7779],
[ 4.6533, -2.8099, 13.6252],
[ 1.8222, -3.6898, 8.8600]]]
)
self.assertTrue(
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
)
@pytest.mark.slow
def test_inference_no_head(self):
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
# compare the actual values for a slice.
expected_slice = tf.constant(
[[[-0.0231, 0.0782, 0.0074],
[-0.1854, 0.0539, -0.0174],
[ 0.0548, 0.0799, 0.1687]]]
)
self.assertTrue(
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
)
@pytest.mark.slow
def test_inference_classification_head(self):
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = [1, 3]
self.assertEqual(
list(output.numpy().shape),
expected_shape
)
expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
self.assertTrue(
numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3)
)
if __name__ == "__main__":
unittest.main()
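# ===== Next file: TF 2.0 Transfo-XL tests (TFTransfoXLModel, TFTransfoXLLMHeadModel) =====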
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import random
import shutil
import pytest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from transformers import TransfoXLConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
TFTransfoXLLMHeadModel,
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
class TFTransfoXLModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
mem_len=30,
clamp_len=15,
is_training=True,
use_labels=True,
vocab_size=99,
cutoffs=[10, 50, 80],
hidden_size=32,
d_embed=32,
num_attention_heads=4,
d_head=8,
d_inner=128,
div_val=2,
num_hidden_layers=5,
scope=None,
seed=1,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.mem_len = mem_len
self.key_len = seq_length + mem_len
self.clamp_len = clamp_len
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.cutoffs = cutoffs
self.hidden_size = hidden_size
self.d_embed = d_embed
self.num_attention_heads = num_attention_heads
self.d_head = d_head
self.d_inner = d_inner
self.div_val = div_val
self.num_hidden_layers = num_hidden_layers
self.scope = scope
self.seed = seed
def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = TransfoXLConfig(
vocab_size_or_config_json_file=self.vocab_size,
mem_len=self.mem_len,
clamp_len=self.clamp_len,
cutoffs=self.cutoffs,
d_model=self.hidden_size,
d_embed=self.d_embed,
n_head=self.num_attention_heads,
d_head=self.d_head,
d_inner=self.d_inner,
div_val=self.div_val,
n_layer=self.num_hidden_layers)
return (config, input_ids_1, input_ids_2, lm_labels)
def set_seed(self):
random.seed(self.seed)
tf.random.set_seed(self.seed)
def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
model = TFTransfoXLModel(config)
hidden_states_1, mems_1 = model(input_ids_1)
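            # feed the mems returned by the first forward pass back in for the second segment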
inputs = {'input_ids': input_ids_2,
'mems': mems_1}
hidden_states_2, mems_2 = model(inputs)
result = {
"hidden_states_1": hidden_states_1.numpy(),
"mems_1": [mem.numpy() for mem in mems_1],
"hidden_states_2": hidden_states_2.numpy(),
"mems_2": [mem.numpy() for mem in mems_2],
}
self.parent.assertListEqual(
list(result["hidden_states_1"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(
list(result["hidden_states_2"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
model = TFTransfoXLLMHeadModel(config)
lm_logits_1, mems_1 = model(input_ids_1)
inputs = {'input_ids': input_ids_1,
'labels': lm_labels}
_, mems_1 = model(inputs)
lm_logits_2, mems_2 = model([input_ids_2, mems_1])
inputs = {'input_ids': input_ids_1,
'mems': mems_1,
'labels': lm_labels}
_, mems_2 = model(inputs)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"lm_logits_1": lm_logits_1.numpy(),
"mems_2": [mem.numpy() for mem in mems_2],
"lm_logits_2": lm_logits_2.numpy(),
}
self.parent.assertListEqual(
list(result["lm_logits_1"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
self.parent.assertListEqual(
list(result["lm_logits_2"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1}
return config, inputs_dict
def setUp(self):
self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self)
self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_transfo_xl_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_transfo_xl_model(*config_and_inputs)
def test_transfo_xl_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
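# ===== Next file: TF 2.0 XLM tests (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple) =====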
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers import (XLMConfig, TFXLMModel,
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple) if is_tf_available() else ()
class TFXLMModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_lengths=True,
use_token_type_ids=True,
use_labels=True,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=2,
vocab_size=99,
n_special=0,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
summary_type="last",
use_proj=True,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_lengths = use_input_lengths
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.gelu_activation = gelu_activation
self.sinusoidal_embeddings = sinusoidal_embeddings
self.asm = asm
self.n_langs = n_langs
self.vocab_size = vocab_size
self.n_special = n_special
self.summary_type = summary_type
self.causal = causal
self.use_proj = use_proj
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
input_lengths = None
if self.use_input_lengths:
input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
sequence_labels = None
token_labels = None
is_impossible_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
config = XLMConfig(
vocab_size_or_config_json_file=self.vocab_size,
n_special=self.n_special,
emb_dim=self.hidden_size,
n_layers=self.num_hidden_layers,
n_heads=self.num_attention_heads,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
gelu_activation=self.gelu_activation,
sinusoidal_embeddings=self.sinusoidal_embeddings,
asm=self.asm,
causal=self.causal,
n_langs=self.n_langs,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
summary_type=self.summary_type,
use_proj=self.use_proj)
return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask
def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
model = TFXLMModel(config=config)
inputs = {'input_ids': input_ids,
'lengths': input_lengths,
'langs': token_type_ids}
outputs = model(inputs)
inputs = [input_ids, input_mask]
outputs = model(inputs)
sequence_output = outputs[0]
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
model = TFXLMWithLMHeadModel(config)
inputs = {'input_ids': input_ids,
'lengths': input_lengths,
'langs': token_type_ids}
outputs = model(inputs)
logits = outputs[0]
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
model = TFXLMForQuestionAnsweringSimple(config)
inputs = {'input_ids': input_ids,
'lengths': input_lengths}
            start_logits, end_logits = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
}
self.parent.assertListEqual(
list(result["start_logits"].shape),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].shape),
[self.batch_size, self.seq_length])
def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
model = TFXLMForSequenceClassification(config)
inputs = {'input_ids': input_ids,
'lengths': input_lengths}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.type_sequence_label_size])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_lengths,
sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'langs': token_type_ids, 'lengths': input_lengths}
return config, inputs_dict
def setUp(self):
self.model_tester = TFXLMModelTest.TFXLMModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_xlm_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_model(*config_and_inputs)
def test_xlm_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)
def test_xlm_qa(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_qa(*config_and_inputs)
def test_xlm_sequence_classif(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFXLMModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
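# ===== Next file: TF 2.0 XLNet tests (TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple) =====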
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import random
import shutil
import pytest
from transformers import XLNetConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
TFXLNetForSequenceClassification,
TFXLNetForQuestionAnsweringSimple,
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Requires TensorFlow")
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
    all_model_classes = (TFXLNetModel, TFXLNetLMHeadModel,
TFXLNetForSequenceClassification,
TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
test_pruning = False
class TFXLNetModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
mem_len=10,
clamp_len=-1,
reuse_len=15,
is_training=True,
use_labels=True,
vocab_size=99,
cutoffs=[10, 50, 80],
hidden_size=32,
num_attention_heads=4,
d_inner=128,
num_hidden_layers=5,
max_position_embeddings=10,
type_sequence_label_size=2,
untie_r=True,
bi_data=False,
same_length=False,
initializer_range=0.05,
seed=1,
type_vocab_size=2,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.mem_len = mem_len
# self.key_len = seq_length + mem_len
self.clamp_len = clamp_len
self.reuse_len = reuse_len
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.cutoffs = cutoffs
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.d_inner = d_inner
self.num_hidden_layers = num_hidden_layers
self.max_position_embeddings = max_position_embeddings
self.bi_data = bi_data
self.untie_r = untie_r
self.same_length = same_length
self.initializer_range = initializer_range
self.seed = seed
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32)
perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
# perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
# target_mapping[:, 0, -1] = 1.0 # predict last token
sequence_labels = None
lm_labels = None
is_impossible_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
config = XLNetConfig(
vocab_size_or_config_json_file=self.vocab_size,
d_model=self.hidden_size,
n_head=self.num_attention_heads,
d_inner=self.d_inner,
n_layer=self.num_hidden_layers,
untie_r=self.untie_r,
max_position_embeddings=self.max_position_embeddings,
mem_len=self.mem_len,
clamp_len=self.clamp_len,
same_length=self.same_length,
reuse_len=self.reuse_len,
bi_data=self.bi_data,
initializer_range=self.initializer_range,
num_labels=self.type_sequence_label_size)
return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
def set_seed(self):
random.seed(self.seed)
tf.random.set_seed(self.seed)
def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
model = TFXLNetModel(config)
inputs = {'input_ids': input_ids_1,
'input_mask': input_mask,
'token_type_ids': segment_ids}
_, _ = model(inputs)
inputs = [input_ids_1, input_mask]
outputs, mems_1 = model(inputs)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"outputs": outputs.numpy(),
}
self.parent.assertListEqual(
list(result["outputs"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
model = TFXLNetLMHeadModel(config)
inputs_1 = {'input_ids': input_ids_1,
'token_type_ids': segment_ids}
all_logits_1, mems_1 = model(inputs_1)
inputs_2 = {'input_ids': input_ids_2,
'mems': mems_1,
'token_type_ids': segment_ids}
all_logits_2, mems_2 = model(inputs_2)
inputs_3 = {'input_ids': input_ids_q,
'perm_mask': perm_mask,
'target_mapping': target_mapping}
logits, _ = model(inputs_3)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"all_logits_1": all_logits_1.numpy(),
"mems_2": [mem.numpy() for mem in mems_2],
"all_logits_2": all_logits_2.numpy(),
}
self.parent.assertListEqual(
list(result["all_logits_1"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
self.parent.assertListEqual(
list(result["all_logits_2"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
model = TFXLNetForQuestionAnsweringSimple(config)
inputs = {'input_ids': input_ids_1,
'attention_mask': input_mask,
'token_type_ids': segment_ids}
start_logits, end_logits, mems = model(inputs)
result = {
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
"mems": [m.numpy() for m in mems],
}
self.parent.assertListEqual(
list(result["start_logits"].shape),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].shape),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
model = TFXLNetForSequenceClassification(config)
logits, mems_1 = model(input_ids_1)
result = {
"mems_1": [mem.numpy() for mem in mems_1],
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.type_sequence_label_size])
self.parent.assertListEqual(
list(list(mem.shape) for mem in result["mems_1"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, segment_ids, lm_labels,
sequence_labels, is_impossible_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1}
return config, inputs_dict
def setUp(self):
self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_xlnet_base_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
def test_xlnet_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
def test_xlnet_sequence_classif(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
def test_xlnet_qa(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
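The remaining hunks update the existing PyTorch test modules, first Transfo-XL and then XLM: their imports move from pytorch_transformers to transformers and are gated behind is_torch_available(), and the XLM test gains a create_and_check_xlm_simple_qa check for the new XLMForQuestionAnsweringSimple head.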
...@@ -21,17 +21,21 @@ import random
import shutil

import pytest

from transformers import is_torch_available

if is_torch_available():
    import torch
    from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
    from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester


class TransfoXLModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
...@@ -202,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
...
...@@ -20,8 +20,14 @@ import unittest
import shutil

import pytest

from transformers import is_torch_available

if is_torch_available():
    from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                              XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
    from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
...@@ -29,9 +35,9 @@ from .configuration_common_test import ConfigTester

class XLMModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                         XLMForSequenceClassification, XLMForQuestionAnsweringSimple) if is_torch_available() else ()

    class XLMModelTester(object):
...@@ -174,12 +180,36 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                [self.batch_size, self.seq_length, self.vocab_size])
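
        # New check for XLMForQuestionAnsweringSimple: a SQuAD-style head that returns (loss, start_logits, end_logits).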
        def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMForQuestionAnsweringSimple(config)
            model.eval()

            outputs = model(input_ids)

            outputs = model(input_ids, start_positions=sequence_labels,
                            end_positions=sequence_labels)

            loss, start_logits, end_logits = outputs

            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }

            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMForQuestionAnswering(config)
            model.eval()

            outputs = model(input_ids)
            start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
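            # The beam-search QA head no longer returns mems here, so the tuple unpacks into five outputs.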

            outputs = model(input_ids, start_positions=sequence_labels,
                            end_positions=sequence_labels,
...@@ -266,24 +296,25 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_model(*config_and_inputs)

    def test_xlm_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)

    def test_xlm_simple_qa(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs)

    def test_xlm_qa(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_qa(*config_and_inputs)

    def test_xlm_sequence_classif(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
...
...@@ -23,10 +23,15 @@ import random
import shutil

import pytest

from transformers import is_torch_available

if is_torch_available():
    import torch
    from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
    from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
...@@ -34,7 +39,7 @@ from .configuration_common_test import ConfigTester

class XLNetModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (XLNetModel, XLNetLMHeadModel,
                         XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else ()
    test_pruning = False

    class XLNetModelTester(object):
...@@ -312,7 +317,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
...
...@@ -18,11 +18,17 @@ from __future__ import print_function
import unittest
import os

import pytest

from transformers import is_torch_available

if is_torch_available():
    import torch
    from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
                              WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .tokenization_tests_commons import TemporaryDirectory
...@@ -71,8 +77,8 @@ class OptimizationTest(unittest.TestCase):

class ScheduleInitTest(unittest.TestCase):
    m = torch.nn.Linear(50, 50) if is_torch_available() else None
    optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
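    # These run at class-definition time, so they must be guarded even though the whole module is skipped without torch.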
    num_steps = 10

    def assertListAlmostEqual(self, list1, list2, tol):
...
...@@ -21,21 +21,20 @@ import shutil
import pytest
import logging

from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP


class AutoTokenizerTest(unittest.TestCase):
    def test_tokenizer_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, BertTokenizer)
            self.assertGreater(len(tokenizer), 0)

        for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
...
...@@ -18,7 +18,7 @@ import os
import unittest
from io import open

from transformers.tokenization_bert import (BasicTokenizer,
                                            BertTokenizer,
                                            WordpieceTokenizer,
                                            _is_control, _is_punctuation,
...@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]
...
...@@ -18,7 +18,7 @@ import os
import unittest
from io import open

from transformers.tokenization_distilbert import (DistilBertTokenizer)

from .tokenization_tests_commons import CommonTestCases
from .tokenization_bert_test import BertTokenizationTest
...@@ -36,11 +36,13 @@ class DistilBertTokenizationTest(BertTokenizationTest):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
            text_2 + [tokenizer.sep_token_id]
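        # Using cls_token_id / sep_token_id instead of hard-coded 101 / 102 keeps the check independent of the test vocabulary.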

if __name__ == '__main__':
    unittest.main()
...@@ -19,7 +19,7 @@ import unittest
import json
from io import open

from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases
...@@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text, add_prefix_space=True)
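        # GPT-2 BPE is whitespace-sensitive: add_prefix_space=True prepends the space marker \u0120 to the first token.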
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
...
...@@ -18,7 +18,7 @@ import os
import unittest
import json

from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases
...
...@@ -19,7 +19,7 @@ import json
import unittest
from io import open

from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases
...@@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text, add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
...@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode
...
...@@ -186,3 +186,92 @@ class CommonTestCases:
            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)

        def test_mask_output(self):
            if sys.version_info <= (3, 0):
                return

            tokenizer = self.get_tokenizer()
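            # Only runs when the tokenizer overrides the base (no-op) special-token method; then
            # token_type_ids must stay aligned one-to-one with input_ids once special tokens are added.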
            if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                seq_0 = "Test this method."
                seq_1 = "With these inputs."
                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                sequences, mask = information["input_ids"], information["token_type_ids"]
                assert len(sequences) == len(mask)

        def test_number_of_added_tokens(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "Test this method."
            seq_1 = "With these inputs."

            sequences = tokenizer.encode(seq_0, seq_1)
            attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

            # Method is implemented (e.g. not GPT-2)
            if len(attached_sequences) != 2:
                assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)

        def test_maximum_encoding_length_single_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            stride = 2

            sequence = tokenizer.encode(seq_0)
            num_added_tokens = tokenizer.num_added_tokens()
            total_length = len(sequence) + num_added_tokens
            information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
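
            # Truncating two tokens with stride 2 should surface the last 2 + stride ids as overflow.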
            assert len(overflowing_tokens) == 2 + stride
            assert overflowing_tokens == sequence[-(2 + stride):]
            assert len(truncated_sequence) == total_length - 2
            assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])

        def test_maximum_encoding_length_pair_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            seq_1 = "This is another sentence to be encoded."
            stride = 2

            sequence_0_no_special_tokens = tokenizer.encode(seq_0)
            sequence_1_no_special_tokens = tokenizer.encode(seq_1)

            sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
            truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
                tokenizer.encode(seq_0),
                tokenizer.encode(seq_1)[:-2]
            )

            information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
                                                stride=stride, truncate_first_sequence=False)
            information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                add_special_tokens=True, stride=stride,
                                                                truncate_first_sequence=True)

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
            overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
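
            # truncate_first_sequence picks which of the two sequences loses tokens; the overflow must come from that one.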
            assert len(overflowing_tokens) == 2 + stride
            assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
            assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
            assert len(truncated_sequence) == len(sequence) - 2
            assert truncated_sequence == truncated_second_sequence

        def test_encode_input_type(self):
            tokenizer = self.get_tokenizer()

            sequence = "Let's encode this sequence"
            tokens = tokenizer.tokenize(sequence)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

            assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
            assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
...@@ -16,15 +16,22 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest

import pytest
from io import open

from transformers import is_torch_available

if is_torch_available():
    import torch
    from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
else:
    pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save

from .tokenization_tests_commons import CommonTestCases


class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = TransfoXLTokenizer if is_torch_available() else None

    def setUp(self):
        super(TransfoXLTokenizationTest, self).setUp()
...
...@@ -19,8 +19,8 @@ from __future__ import print_function
import unittest
import six

from transformers import PreTrainedTokenizer
from transformers.tokenization_gpt2 import GPT2Tokenizer


class TokenizerUtilsTest(unittest.TestCase):
    def check_tokenizer_from_pretrained(self, tokenizer_class):
...