"vscode:/vscode.git/clone" did not exist on "6c4c7be2828b3f2abf95f8e6feeff16e6bd880a6"
Unverified commit 54abc67a, authored by Thomas Wolf and committed by GitHub

Merge pull request #2255 from aaugustin/implement-best-practices

Implement some Python best practices
parents 645713e2 c11b3e29
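Most of the diff below is mechanical style cleanup rather than behavioral change: the three separate `__future__` imports collapse onto one line, imports are grouped and sorted, single-quoted strings become double-quoted, and long signatures and calls are exploded one argument per line with a trailing comma (black-style formatting). A minimal before/after sketch of the pattern, using a hypothetical helper:

# Before: packed signature, single quotes
def build_inputs(self, batch_size=13, seq_length=7, vocab_size=99):
    return {'batch_size': batch_size, 'seq_length': seq_length}

# After: one argument per line with a trailing comma, double quotes;
# future additions to the signature touch only the lines they add
def build_inputs(
    self,
    batch_size=13,
    seq_length=7,
    vocab_size=99,
):
    return {"batch_size": batch_size, "seq_length": seq_length}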
@@ -12,22 +12,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    from transformers import (
        AlbertConfig,
        AlbertModel,
        AlbertForMaskedLM,
        AlbertForSequenceClassification,
        AlbertForQuestionAnswering,
    )
    from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -37,33 +40,33 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()

    class AlbertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            embedding_size=16,
            hidden_size=36,
            num_hidden_layers=6,
            num_hidden_groups=6,
            num_attention_heads=6,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -120,16 +123,17 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                num_hidden_groups=self.num_hidden_groups,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_albert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertModel(config=config)
            model.to(torch_device)
            model.eval()
@@ -142,66 +146,79 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_albert_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = AlbertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels,
            )
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_albert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = AlbertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):

@@ -233,5 +250,6 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
            model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
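The testers above build all of their inputs with `ids_tensor` from `modeling_common_test`. A minimal sketch of what such a helper can look like — a random integer tensor with values in `[0, vocab_size)` — assuming the real implementation may differ in details:

import random

import torch


def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Creates a random long tensor of the given shape, for use as token ids."""
    if rng is None:
        rng = random.Random()

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = [rng.randint(0, vocab_size - 1) for _ in range(total_dims)]
    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()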
@@ -12,29 +12,31 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import logging
import unittest

from transformers import is_torch_available

from .utils import SMALL_MODEL_IDENTIFIER, require_torch, slow


if is_torch_available():
    from transformers import (
        AutoConfig,
        BertConfig,
        AutoModel,
        BertModel,
        AutoModelWithLMHead,
        BertForMaskedLM,
        AutoModelForSequenceClassification,
        BertForSequenceClassification,
        AutoModelForQuestionAnswering,
        BertForQuestionAnswering,
    )
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP


@require_torch
class AutoModelTest(unittest.TestCase):

@@ -75,7 +77,9 @@ class AutoModelTest(unittest.TestCase):
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            model, loading_info = AutoModelForSequenceClassification.from_pretrained(
                model_name, output_loading_info=True
            )
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForSequenceClassification)
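The `output_loading_info=True` flag exercised here makes `from_pretrained` return the model together with a dict recording how checkpoint weights mapped onto the architecture. A short usage sketch (the checkpoint name is illustrative):

from transformers import AutoModelForSequenceClassification

model, loading_info = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", output_loading_info=True
)
# loading_info holds entries such as "missing_keys" (weights newly initialized,
# e.g. a task head) and "unexpected_keys" (checkpoint weights left unused).
print(loading_info["missing_keys"])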
......
@@ -12,59 +12,75 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, floats_tensor, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    from transformers import (
        BertConfig,
        BertModel,
        BertForMaskedLM,
        BertForNextSentencePrediction,
        BertForPreTraining,
        BertForQuestionAnswering,
        BertForSequenceClassification,
        BertForTokenClassification,
        BertForMultipleChoice,
    )
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class BertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (
        (
            BertModel,
            BertForMaskedLM,
            BertForNextSentencePrediction,
            BertForPreTraining,
            BertForQuestionAnswering,
            BertForSequenceClassification,
            BertForTokenClassification,
        )
        if is_torch_available()
        else ()
    )
    class BertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -119,25 +135,44 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                is_decoder=False,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def prepare_config_and_inputs_for_decoder(self):
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = self.prepare_config_and_inputs()

            config.is_decoder = True
            encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
            encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            return (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
                encoder_hidden_states,
                encoder_attention_mask,
            )

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_bert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = BertModel(config=config)
            model.to(torch_device)
            model.eval()
@@ -150,16 +185,38 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_bert_model_as_decoder(
            self,
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            encoder_hidden_states,
            encoder_attention_mask,
        ):
            model = BertModel(config)
            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
            )
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states,
            )
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
@@ -167,122 +224,171 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_bert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = BertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_bert_model_for_masked_lm_as_decoder(
            self,
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            encoder_hidden_states,
            encoder_attention_mask,
        ):
            model = BertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
            )
            loss, prediction_scores = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                encoder_hidden_states=encoder_hidden_states,
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_bert_for_next_sequence_prediction(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = BertForNextSentencePrediction(config=config)
            model.to(torch_device)
            model.eval()
            loss, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                next_sentence_label=sequence_labels,
            )
            result = {
                "loss": loss,
                "seq_relationship_score": seq_relationship_score,
            }
            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
            self.check_loss_output(result)

        def create_and_check_bert_for_pretraining(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = BertForPreTraining(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels,
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
                "seq_relationship_score": seq_relationship_score,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
            self.check_loss_output(result)

        def create_and_check_bert_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = BertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                start_positions=sequence_labels,
                end_positions=sequence_labels,
            )
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_bert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = BertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_bert_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = BertForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
            )
            self.check_loss_output(result)

        def create_and_check_bert_for_multiple_choice(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_choices = self.num_choices
            model = BertForMultipleChoice(config=config)
            model.to(torch_device)
@@ -290,24 +396,31 @@ class BertModelTest(CommonTestCases.CommonModelTester):
            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            loss, logits = model(
                multiple_choice_inputs_ids,
                attention_mask=multiple_choice_input_mask,
                token_type_ids=multiple_choice_token_type_ids,
                labels=choice_labels,
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
......
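`prepare_config_and_inputs_for_decoder` and the `*_as_decoder` checks above exercise BERT's cross-attention path: with `config.is_decoder = True`, the forward pass also accepts the encoder's hidden states and attention mask. A condensed sketch of that call pattern, mirroring the tester's default shapes (exact config kwargs may vary across library versions):

import torch

from transformers import BertConfig, BertModel

config = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4)
config.is_decoder = True

model = BertModel(config)
model.eval()

batch_size, seq_length = 13, 7
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
# Pretend encoder output: one hidden state per source token.
encoder_hidden_states = torch.rand(batch_size, seq_length, config.hidden_size)
encoder_attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)

with torch.no_grad():
    sequence_output, pooled_output = model(
        input_ids,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
    )
assert sequence_output.shape == (batch_size, seq_length, config.hidden_size)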
@@ -12,58 +12,64 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import copy
import json
import logging
import os.path
import random
import shutil
import sys
import tempfile
import unittest
import uuid

from transformers import is_torch_available

from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    import torch
    import numpy as np

    from transformers import (
        AdaptiveEmbedding,
        PretrainedConfig,
        PreTrainedModel,
        BertModel,
        BertConfig,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str
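Both branches expose the same context-manager interface, which the save/reload tests further down use for round-trips through save_pretrained / from_pretrained. On Python 3 it is simply the standard library class:

import tempfile

# The directory and its contents are removed when the block exits.
with tempfile.TemporaryDirectory() as temp_dir_name:
    print("saving checkpoint under", temp_dir_name)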
def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
        if "_range" in key or "_std" in key or "initializer_factor" in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init
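`_config_zero_init` backs the initialization test shown in the next hunk: with every `*_range` / `*_std` / `initializer_factor` entry forced to 0.0, config-driven weight initialization should yield parameters whose mean is exactly 0.0 (or 1.0, e.g. LayerNorm scales), so any other value points at a weight initialized outside the config-controlled path. A sketch of the intended use (assuming a default BertConfig):

from transformers import BertConfig, BertModel

configs_no_init = _config_zero_init(BertConfig())
model = BertModel(configs_no_init)
for name, param in model.named_parameters():
    if param.requires_grad:
        assert param.data.mean().item() in [0.0, 1.0], name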
class CommonTestCases:
    @require_torch
    class CommonModelTester(unittest.TestCase):
@@ -108,8 +114,11 @@ class CommonTestCases:
                model = model_class(config=configs_no_init)
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        self.assertIn(
                            param.data.mean().item(),
                            [0.0, 1.0],
                            msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
                        )
        def test_determinism(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -131,10 +140,22 @@ class CommonTestCases:
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            decoder_seq_length = (
                self.model_tester.decoder_seq_length
                if hasattr(self.model_tester, "decoder_seq_length")
                else self.model_tester.seq_length
            )
            encoder_seq_length = (
                self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "encoder_seq_length")
                else self.model_tester.seq_length
            )
            decoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
            )
            encoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
            )

            for model_class in self.all_model_classes:
                config.output_attentions = True
@@ -150,23 +171,20 @@ class CommonTestCases:
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
                out_len = len(outputs)

                if self.is_encoder_decoder:
                    self.assertEqual(out_len % 2, 0)
                    decoder_attentions = outputs[(out_len // 2) - 1]
                    self.assertEqual(model.config.output_attentions, True)
                    self.assertEqual(model.config.output_hidden_states, False)
                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                    self.assertListEqual(
                        list(decoder_attentions[0].shape[-3:]),
                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                    )

                # Check attention is always last and order is fine
                config.output_attentions = True

@@ -184,9 +202,8 @@ class CommonTestCases:
                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -215,7 +232,7 @@ class CommonTestCases:
                model = model_class(config=configs_no_init)
                model.to(torch_device)
                model.eval()
                inputs = inputs_dict["input_ids"]  # Let's keep only input_ids

                try:
                    traced_gpt2 = torch.jit.trace(model, inputs)
@@ -269,12 +286,14 @@ class CommonTestCases:
            # Prepare head_mask
            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
            head_mask = torch.ones(
                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
            )
            head_mask[0, 0] = 0
            head_mask[-1, :-1] = 0
            head_mask.requires_grad_(requires_grad=True)
            inputs = inputs_dict.copy()
            inputs["head_mask"] = head_mask

            outputs = model(**inputs)
@@ -289,21 +308,20 @@ class CommonTestCases:
            # Remove Nan
            for t in attentions:
                self.assertLess(
                    torch.sum(torch.isnan(t)), t.numel() / 4
                )  # Check we don't have more than 25% nans (arbitrary)
            attentions = [
                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
            ]  # remove them (the test is less complete)

            self.assertIsNotNone(multihead_outputs)
            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)

            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
        def test_head_pruning(self):
            if not self.test_pruning:

@@ -320,20 +338,16 @@ class CommonTestCases:
                model = model_class(config=config)
                model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                model.prune_heads(heads_to_prune)
                with torch.no_grad():
                    outputs = model(**inputs_dict)

                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

        def test_head_pruning_save_load_from_pretrained(self):
            if not self.test_pruning:
@@ -350,8 +364,7 @@ class CommonTestCases:
                model = model_class(config=config)
                model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                model.prune_heads(heads_to_prune)

                with TemporaryDirectory() as temp_dir_name:

@@ -366,7 +379,6 @@ class CommonTestCases:
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

        def test_head_pruning_save_load_from_config_init(self):
            if not self.test_pruning:
                return
@@ -380,8 +392,7 @@ class CommonTestCases:
                config.output_attentions = True
                config.output_hidden_states = False

                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                config.pruned_heads = heads_to_prune

                model = model_class(config=config)
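The pruning tests drive `prune_heads`, which takes a dict mapping a layer index to the list of head indices to drop; a negative layer index counts from the end, as in the `{0: [...], -1: [0]}` dicts above. A minimal sketch (sizes are illustrative):

from transformers import BertConfig, BertModel

config = BertConfig(num_hidden_layers=5, num_attention_heads=4)
model = BertModel(config)

# Drop heads 1-3 in the first layer and head 0 in the last layer,
# matching the shape assertions in the tests above.
model.prune_heads({0: [1, 2, 3], -1: [0]})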
@@ -446,7 +457,7 @@ class CommonTestCases:
                outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

@@ -470,8 +481,13 @@ class CommonTestCases:
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
                    [
                        self.model_tester.encoder_seq_length
                        if hasattr(self.model_tester, "encoder_seq_length")
                        else self.model_tester.seq_length,
                        self.model_tester.hidden_size,
                    ],
                )

        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -512,15 +528,10 @@ class CommonTestCases:
            for model_class in self.all_model_classes:
                model = model_class(config)

                self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
                model.set_input_embeddings(torch.nn.Embedding(10, 10))

                x = model.get_output_embeddings()
                self.assertTrue(x is None or isinstance(x, torch.nn.Linear))

        def test_tie_model_weights(self):
            if not self.test_torchscript:
@@ -602,30 +613,30 @@ class CommonTestCases:
            outputs = model(**inputs_dict)

    class GPTModelTester(CommonModelTester):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_position_ids=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            n_positions=33,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            n_choices=3,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            scope=None,
            config_class=None,
            base_model_class=None,
            lm_head_model_class=None,
            double_head_model_class=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -676,13 +687,14 @@ class CommonTestCases:
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                initializer_range=self.initializer_range,
            )

            return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids)

        def create_and_check_base_model(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.base_model_class(config)
            model.to(torch_device)
            model.eval()

@@ -694,12 +706,12 @@ class CommonTestCases:
            hidden_state = outputs[0]
            self.parent.assertListEqual(
                list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
            )

        def create_and_check_lm_head(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.lm_head_model_class(config)
            model.to(torch_device)
            model.eval()

@@ -709,14 +721,13 @@ class CommonTestCases:
            total_voc = self.vocab_size
            self.parent.assertListEqual(
                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
            )
            self.parent.assertListEqual(list(loss.size()), [])

        def create_and_check_presents(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            for model_class in self.all_model_classes:
                model = model_class(config)
                model.to(torch_device)

@@ -727,30 +738,39 @@ class CommonTestCases:
                self.parent.assertEqual(self.num_hidden_layers, len(presents))
                self.parent.assertListEqual(
                    list(presents[0].size()),
                    [
                        2,
                        self.batch_size * self.n_choices,
                        self.num_attention_heads,
                        self.seq_length,
                        self.hidden_size // self.num_attention_heads,
                    ],
                )

        def create_and_check_double_heads(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.double_head_model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(
                    input_ids,
                    mc_token_ids,
                    lm_labels=lm_labels,
                    mc_labels=mc_labels,
                    token_type_ids=token_type_ids,
                    position_ids=position_ids,
                )
            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
            loss = [lm_loss, mc_loss]
            total_voc = self.vocab_size
            self.parent.assertListEqual(
                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
            )
            self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])
            self.parent.assertListEqual([list(l.size()) for l in loss], [[], []])

        def create_and_check_model_from_pretrained(self):
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:

@@ -759,9 +779,8 @@ class CommonTestCases:
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs
            inputs_dict = {"input_ids": input_ids}
            return config, inputs_dict

        def run_common_tests(self, test_presents=False):

@@ -791,10 +810,10 @@ class ConfigTester(object):
    def create_and_test_config_common_properties(self):
        config = self.config_class(**self.inputs_dict)
        self.parent.assertTrue(hasattr(config, "vocab_size"))
        self.parent.assertTrue(hasattr(config, "hidden_size"))
        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))

    def create_and_test_config_to_json_string(self):
        config = self.config_class(**self.inputs_dict)
......
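# The shape checks in the testers above compare torch.Size objects against
# plain Python lists, so a scalar loss is asserted to have the empty shape [].
# A minimal standalone sketch of that idiom (the assert_shape helper is
# illustrative, not part of the PR):
import torch

def assert_shape(tensor, expected):
    # torch.Size subclasses tuple; casting to a list keeps failures readable,
    # and a 0-d scalar yields the empty list
    assert list(tensor.size()) == expected, (list(tensor.size()), expected)

assert_shape(torch.tensor(0.0), [])                       # scalar loss
assert_shape(torch.zeros(13, 4, 7, 99), [13, 4, 7, 99])   # [batch, n_choices, seq, vocab]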
@@ -11,24 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel


@require_torch
class CTRLModelTest(CommonTestCases.CommonModelTester):
@@ -39,32 +36,32 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
    test_head_masking = False

    class CTRLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -129,12 +126,20 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = CTRLModel(config=config)
@@ -150,8 +155,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                "presents": presents,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -161,29 +166,28 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}

            return config, inputs_dict
...
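# Every file in this PR converges on the same import layout: __future__ first,
# then the standard library, then the is_torch_available() probe, then the
# relative test helpers, and only then the guarded heavyweight imports. A
# minimal skeleton of that layout (the test case itself is illustrative):
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

if is_torch_available():
    import torch  # imported only when the backend is actually installed


class BackendFlagTest(unittest.TestCase):
    def test_flag_is_boolean(self):
        self.assertIsInstance(is_torch_available(), bool)


if __name__ == "__main__":
    unittest.main()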
@@ -12,60 +12,67 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import require_torch, torch_device


if is_torch_available():
    from transformers import (
        DistilBertConfig,
        DistilBertModel,
        DistilBertForMaskedLM,
        DistilBertForTokenClassification,
        DistilBertForQuestionAnswering,
        DistilBertForSequenceClassification,
    )


@require_torch
class DistilBertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (
        (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
        if is_torch_available()
        else None
    )
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
    test_head_masking = True

    class DistilBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=False,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -114,16 +121,17 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_distilbert_model(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = DistilBertModel(config=config)
            model.to(torch_device)
            model.eval()
@@ -134,10 +142,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                "sequence_output": sequence_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_distilbert_for_masked_lm(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = DistilBertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
@@ -147,29 +157,31 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_distilbert_for_question_answering(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = DistilBertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(
                input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
            )
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_sequence_classification(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = DistilBertForSequenceClassification(config)
            model.to(torch_device)
@@ -179,12 +191,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_token_classification(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = DistilBertForTokenClassification(config=config)
            model.to(torch_device)
@@ -196,14 +208,14 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
            )
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
@@ -239,5 +251,6 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
            # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            # self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
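# In the file above, all_model_classes collapses when torch is missing so the
# shared tests are skipped rather than crashing at import time; DistilBert
# uses None as the sentinel where most testers use an empty tuple. A sketch of
# the guard (the module-level variable is illustrative):
from transformers import is_torch_available

if is_torch_available():
    from transformers import DistilBertModel, DistilBertForMaskedLM

# a conditional expression evaluates its condition first, so the class names
# are never touched when torch is absent
all_model_classes = (DistilBertModel, DistilBertForMaskedLM) if is_torch_available() else ()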
@@ -17,8 +17,10 @@ import logging
import unittest

from transformers import is_torch_available

from .utils import require_torch, slow


if is_torch_available():
    from transformers import BertModel, BertForMaskedLM, Model2Model
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -39,13 +41,13 @@ class EncoderDecoderModelTest(unittest.TestCase):
    def test_model2model_from_pretrained_not_bert(self):
        logging.basicConfig(level=logging.INFO)
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("roberta")
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("distilbert")
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("does-not-exist")


if __name__ == "__main__":
...
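# The error-path test above leans on assertRaises as a context manager. A
# self-contained sketch of the pattern with a stand-in loader
# (fake_from_pretrained is hypothetical, not a transformers API):
import unittest


class ErrorPathTest(unittest.TestCase):
    def test_bad_identifier_raises(self):
        def fake_from_pretrained(name):
            # stand-in for Model2Model.from_pretrained: reject unknown ids
            if name != "bert-base-uncased":
                raise ValueError("unsupported model: {}".format(name))

        with self.assertRaises(ValueError):
            fake_from_pretrained("does-not-exist")


if __name__ == "__main__":
    unittest.main()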
@@ -12,55 +12,59 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    from transformers import (
        GPT2Config,
        GPT2Model,
        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        GPT2LMHeadModel,
        GPT2DoubleHeadsModel,
    )


@require_torch
class GPT2ModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()

    class GPT2ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -125,12 +129,20 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = GPT2Model(config=config)
@@ -146,8 +158,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                "presents": presents,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -157,63 +169,58 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_double_lm_head_model(
            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
        ):
            model = GPT2DoubleHeadsModel(config)
            model.to(torch_device)
            model.eval()

            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "mc_token_ids": mc_token_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
                "lm_labels": multiple_choice_inputs_ids,
            }

            loss, lm_logits, mc_logits, _ = model(**inputs)

            result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}

            return config, inputs_dict
...
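# The double-heads test above builds its multiple-choice batch by repeating
# each [batch, seq] tensor across a new num_choices axis. A standalone sketch
# of that expansion, with sizes mirroring the tester defaults:
import torch

batch_size, num_choices, seq_length, vocab_size = 13, 4, 7, 99
input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))

# unsqueeze adds the choices axis, expand repeats it without copying, and
# contiguous() materializes the layout downstream ops expect
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
assert multiple_choice_inputs_ids.shape == (batch_size, num_choices, seq_length)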
@@ -12,53 +12,59 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    from transformers import (
        OpenAIGPTConfig,
        OpenAIGPTModel,
        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        OpenAIGPTLMHeadModel,
        OpenAIGPTDoubleHeadsModel,
    )


@require_torch
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (
        (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
    )

    class OpenAIGPTModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -116,9 +122,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
            model = OpenAIGPTModel(config=config)
@@ -129,12 +133,10 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
            model(input_ids, token_type_ids=token_type_ids)
            (sequence_output,) = model(input_ids)

            result = {"sequence_output": sequence_output}

            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            model = OpenAIGPTLMHeadModel(config)
@@ -143,17 +145,12 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
            loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            model = OpenAIGPTDoubleHeadsModel(config)
@@ -162,26 +159,25 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
            loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                head_mask,
                token_type_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}

            return config, inputs_dict
...
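# Two mechanical rules recur in every hunk of this PR: string literals are
# normalized to double quotes, and any call or literal that overflows the line
# limit is wrapped one element per line with a trailing comma. An illustrative
# before/after on a made-up config dict (not taken from the diff):

# before
config = dict(vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37, initializer_range=0.02)

# after
config = dict(
    vocab_size=99,
    hidden_size=32,
    num_hidden_layers=5,
    num_attention_heads=4,
    intermediate_size=37,
    initializer_range=0.02,
)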
@@ -12,25 +12,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow, torch_device


if is_torch_available():
    import torch
    from transformers import (
        RobertaConfig,
        RobertaModel,
        RobertaForMaskedLM,
        RobertaForSequenceClassification,
        RobertaForTokenClassification,
    )
    from transformers.modeling_roberta import RobertaEmbeddings
    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP


@require_torch
class RobertaModelTest(CommonTestCases.CommonModelTester):
@@ -38,31 +42,31 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()

    class RobertaModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -116,17 +120,17 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_roberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = RobertaModel(config=config)
            model.to(torch_device)
            model.eval()
@@ -139,47 +143,59 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_roberta_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = RobertaForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
            )
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def create_and_check_roberta_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = RobertaForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(
                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
            )
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
            )
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
@@ -214,18 +230,12 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
        model = RobertaEmbeddings(config=config)

        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
        expected_positions = torch.as_tensor(
            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
        )

        position_ids = model.create_position_ids_from_input_ids(input_ids)
        self.assertEqual(position_ids.shape, expected_positions.shape)
        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))

    def test_create_position_ids_from_inputs_embeds(self):
@@ -247,69 +257,47 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
        ]
        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
        self.assertEqual(position_ids.shape, expected_positions.shape)
        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))


class RobertaModelIntegrationTest(unittest.TestCase):
    @slow
    def test_inference_masked_lm(self):
        model = RobertaForMaskedLM.from_pretrained("roberta-base")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
        expected_slice = torch.Tensor(
            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))

    @slow
    def test_inference_no_head(self):
        model = RobertaModel.from_pretrained("roberta-base")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        # compare the actual values for a slice.
        expected_slice = torch.Tensor(
            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))

    @slow
    def test_inference_classification_head(self):
        model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 3))
        self.assertEqual(output.shape, expected_shape)
        expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]])
        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3))


if __name__ == "__main__":
...
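# The position-id tests above pin down padding-aware numbering: real tokens
# count up from padding_idx + 1 while pad positions keep padding_idx. A sketch
# that reproduces the behaviour the test asserts (it mirrors what
# RobertaEmbeddings.create_position_ids_from_input_ids is expected to do, not
# a copy of the implementation):
import torch


def create_position_ids_from_input_ids(input_ids, padding_idx):
    mask = input_ids.ne(padding_idx).long()                  # 1 for real tokens, 0 for pads
    incremental_indices = torch.cumsum(mask, dim=1) * mask   # running count, zeroed at pads
    return incremental_indices + padding_idx


padding_idx = 1
input_ids = torch.as_tensor([[12, 31, 13, padding_idx]])
print(create_position_ids_from_input_ids(input_ids, padding_idx))
# tensor([[2, 3, 4, 1]]) -- matches expected_positions in the test above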
@@ -12,20 +12,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import is_torch_available

from .configuration_common_test import ConfigTester
from .modeling_common_test import CommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_torch, slow


if is_torch_available():
    from transformers import T5Config, T5Model, T5WithLMHeadModel
    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -39,26 +38,26 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
    is_encoder_decoder = True

    class T5ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            encoder_seq_length=7,
            decoder_seq_length=9,
            is_training=True,
            use_attention_mask=True,
            use_labels=True,
            vocab_size=99,
            n_positions=14,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            d_ff=37,
            relative_attention_num_buckets=8,
            dropout_rate=0.1,
            initializer_factor=0.002,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
@@ -101,60 +100,96 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor,
            )

            return (
                config,
                encoder_input_ids,
                decoder_input_ids,
                encoder_attention_mask,
                decoder_attention_mask,
                decoder_lm_labels,
            )

        def check_loss_output(self, result):
            self.parent.assertListEqual(list(result["loss"].size()), [])

        def create_and_check_t5_model(
            self,
            config,
            encoder_input_ids,
            decoder_input_ids,
            encoder_attention_mask,
            decoder_attention_mask,
            decoder_lm_labels,
        ):
            model = T5Model(config=config)
            model.eval()
            decoder_output, encoder_output = model(
                encoder_input_ids=encoder_input_ids,
                decoder_input_ids=decoder_input_ids,
                encoder_attention_mask=encoder_attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            decoder_output, encoder_output = model(
                encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids
            )

            result = {
                "encoder_output": encoder_output,
                "decoder_output": decoder_output,
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
            )

        def create_and_check_t5_with_lm_head(
            self,
            config,
            encoder_input_ids,
            decoder_input_ids,
            encoder_attention_mask,
            decoder_attention_mask,
            decoder_lm_labels,
        ):
            model = T5WithLMHeadModel(config=config)
            model.eval()
            outputs = model(
                encoder_input_ids=encoder_input_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                decoder_lm_labels=decoder_lm_labels,
            )
            loss, prediction_scores = outputs[0], outputs[1]
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
            )
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                encoder_input_ids,
                decoder_input_ids,
                encoder_attention_mask,
                decoder_attention_mask,
                decoder_lm_labels,
            ) = config_and_inputs

            inputs_dict = {
                "encoder_input_ids": encoder_input_ids,
                "decoder_input_ids": decoder_input_ids,
                "decoder_attention_mask": decoder_attention_mask,
                "encoder_attention_mask": encoder_attention_mask,
            }
            return config, inputs_dict

    def setUp(self):
@@ -178,5 +213,6 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
            model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
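# Unlike the single-stack testers, the T5 tester drives the model with
# separate encoder and decoder streams through one kwargs dict. A minimal
# sketch of that dict using the tester's default sizes:
import torch

batch_size, encoder_seq_length, decoder_seq_length, vocab_size = 13, 7, 9, 99
inputs_dict = {
    "encoder_input_ids": torch.randint(0, vocab_size, (batch_size, encoder_seq_length)),
    "decoder_input_ids": torch.randint(0, vocab_size, (batch_size, decoder_seq_length)),
    "encoder_attention_mask": torch.ones(batch_size, encoder_seq_length, dtype=torch.long),
    "decoder_attention_mask": torch.ones(batch_size, decoder_seq_length, dtype=torch.long),
}
# a seq2seq model under test is then called as model(**inputs_dict)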
@@ -12,62 +12,60 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import AlbertConfig, is_tf_available

from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow


if is_tf_available():
    from transformers.modeling_tf_albert import (
        TFAlbertModel,
        TFAlbertForMaskedLM,
        TFAlbertForSequenceClassification,
        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )


@require_tf
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (
        (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else ()
    )

    class TFAlbertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            embedding_size=16,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -93,27 +91,22 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
@@ -127,19 +120,20 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_albert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFAlbertModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # sequence_output, pooled_output = model(**inputs)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
@@ -152,50 +146,52 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFAlbertForMaskedLM(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
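        # NOTE: "(prediction_scores,) = model(inputs)" above is equivalent to the
        # old "prediction_scores, = model(inputs)"; the parentheses only make the
        # one-element tuple unpacking explicit, e.g.:
        #     (value,) = (42,)
        #     assert value == 42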
        def create_and_check_albert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFAlbertForSequenceClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()
@@ -206,13 +202,11 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
...
@@ -12,28 +12,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import logging
import unittest

from transformers import is_tf_available

from .utils import SMALL_MODEL_IDENTIFIER, require_tf, slow


if is_tf_available():
    from transformers import (
        AutoConfig,
        BertConfig,
        TFAutoModel,
        TFBertModel,
        TFAutoModelWithLMHead,
        TFBertForMaskedLM,
        TFAutoModelForSequenceClassification,
        TFBertForSequenceClassification,
        TFAutoModelForQuestionAnswering,
        TFBertForQuestionAnswering,
    )


@require_tf
@@ -41,11 +42,12 @@ class TFAutoModelTest(unittest.TestCase):
    @slow
    def test_model_from_pretrained(self):
        import h5py

        self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))

        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
@@ -58,7 +60,7 @@ class TFAutoModelTest(unittest.TestCase):
    def test_lmhead_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
@@ -71,7 +73,7 @@ class TFAutoModelTest(unittest.TestCase):
    def test_sequence_classification_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
@@ -84,7 +86,7 @@ class TFAutoModelTest(unittest.TestCase):
    def test_question_answering_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)
...
@@ -12,64 +12,74 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import BertConfig, is_tf_available

from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow


if is_tf_available():
    import tensorflow as tf

    from transformers.modeling_tf_bert import (
        TFBertModel,
        TFBertForMaskedLM,
        TFBertForNextSentencePrediction,
        TFBertForPreTraining,
        TFBertForSequenceClassification,
        TFBertForMultipleChoice,
        TFBertForTokenClassification,
        TFBertForQuestionAnswering,
    )


@require_tf
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (
        (
            TFBertModel,
            TFBertForMaskedLM,
            TFBertForNextSentencePrediction,
            TFBertForPreTraining,
            TFBertForQuestionAnswering,
            TFBertForSequenceClassification,
            TFBertForTokenClassification,
        )
        if is_tf_available()
        else ()
    )

    class TFBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -123,15 +133,16 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_bert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
@@ -144,128 +155,119 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_bert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForMaskedLM(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_bert_for_next_sequence_prediction(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForNextSentencePrediction(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (seq_relationship_score,) = model(inputs)
            result = {
                "seq_relationship_score": seq_relationship_score.numpy(),
            }
            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

        def create_and_check_bert_for_pretraining(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForPreTraining(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            prediction_scores, seq_relationship_score = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
                "seq_relationship_score": seq_relationship_score.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

        def create_and_check_bert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFBertForSequenceClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

        def create_and_check_bert_for_multiple_choice(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_choices = self.num_choices
            model = TFBertForMultipleChoice(config=config)
            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
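            # Shape walk-through of the tiling above, for the default tester sizes:
            # (batch_size, seq_length) --expand_dims(axis=1)--> (batch_size, 1, seq_length)
            # --tile((1, num_choices, 1))--> (batch_size, num_choices, seq_length)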
            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
            }
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
        def create_and_check_bert_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFBertForTokenClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
            )

        def create_and_check_bert_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForQuestionAnswering(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
@@ -310,10 +312,10 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
    @slow
    def test_model_from_pretrained(self):
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
@@ -14,53 +14,52 @@
# limitations under the License.
from __future__ import absolute_import, division, print_function

import copy
import os
import random
import shutil
import sys
import tempfile
import unittest

from transformers import is_tf_available, is_torch_available

from .utils import require_tf
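# (Imports above are grouped in the usual convention: standard library first,
# then third-party packages, then local test modules, each group alphabetized.)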
if is_tf_available():
    import tensorflow as tf
    import numpy as np

    from transformers import TFPreTrainedModel

    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP


if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str
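# Either way, callers get the same API, as used further down in this file:
#     with TemporaryDirectory() as tmpdirname:
#         checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
# and the directory is removed when the block exits.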
def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
        if "_range" in key or "_std" in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init
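# For example (a sketch; assumes a config exposing an `initializer_range` field):
#     zeroed = _config_zero_init(BertConfig(initializer_range=0.02))
#     assert zeroed.initializer_range == 0.0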
class TFCommonTestCases:
    @require_tf
    class TFCommonModelTester(unittest.TestCase):
@@ -126,8 +125,9 @@ class TFCommonTestCases:
            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
            pt_model.eval()
            pt_inputs_dict = dict(
                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
            )
            with torch.no_grad():
                pto = pt_model(**pt_inputs_dict)
            tfo = tf_model(inputs_dict, training=False)
@@ -140,18 +140,19 @@ class TFCommonTestCases:
            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
            with TemporaryDirectory() as tmpdirname:
                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
                torch.save(pt_model.state_dict(), pt_checkpoint_path)
                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
                tf_model.save_weights(tf_checkpoint_path)
                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
            pt_model.eval()
            pt_inputs_dict = dict(
                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
            )
            with torch.no_grad():
                pto = pt_model(**pt_inputs_dict)
            tfo = tf_model(inputs_dict)
@@ -166,13 +167,19 @@ class TFCommonTestCases:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if self.is_encoder_decoder:
                input_ids = {
                    "decoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
                    ),
                    "encoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"
                    ),
                }
            else:
                input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

            for model_class in self.all_model_classes:
                # Prepare our model
@@ -188,7 +195,7 @@ class TFCommonTestCases:
                hidden_states = outputs_dict[0]

                # Add a dense layer on top to test integration with other keras modules
                outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)

                # Compile extended model
                extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
@@ -202,7 +209,9 @@ class TFCommonTestCases:
                outputs_dict = model(inputs_dict)

                inputs_keywords = copy.deepcopy(inputs_dict)
                input_ids = inputs_keywords.pop(
                    "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None
                )
                outputs_keywords = model(input_ids, **inputs_keywords)

                output_dict = outputs_dict[0].numpy()
@@ -213,10 +222,22 @@ class TFCommonTestCases:
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            decoder_seq_length = (
                self.model_tester.decoder_seq_length
                if hasattr(self.model_tester, "decoder_seq_length")
                else self.model_tester.seq_length
            )
            encoder_seq_length = (
                self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "encoder_seq_length")
                else self.model_tester.seq_length
            )
            decoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
            )
            encoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
            )
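            # Each fallback above is just getattr() with a default, e.g.:
            #     decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length)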
            for model_class in self.all_model_classes:
                config.output_attentions = True
@@ -229,22 +250,20 @@ class TFCommonTestCases:
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
                out_len = len(outputs)

                if self.is_encoder_decoder:
                    self.assertEqual(out_len % 2, 0)
                    decoder_attentions = outputs[(out_len // 2) - 1]
                    self.assertEqual(model.config.output_attentions, True)
                    self.assertEqual(model.config.output_hidden_states, False)
                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                    self.assertListEqual(
                        list(decoder_attentions[0].shape[-3:]),
                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                    )

                # Check attention is always last and order is fine
                config.output_attentions = True
@@ -259,9 +278,8 @@ class TFCommonTestCases:
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -276,8 +294,8 @@ class TFCommonTestCases:
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
                )

        def test_model_common_attributes(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -307,13 +325,13 @@ class TFCommonTestCases:
            # We used to fall back to just synthetically creating a dummy tensor of ones:
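            # NOTE: the "except Exception:" clauses below replace the earlier bare
            # "except:" clauses, which would also have caught SystemExit and
            # KeyboardInterrupt.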
            try:
                x = wte(input_ids, mode="embedding")
            except Exception:
                try:
                    x = wte([input_ids], mode="embedding")
                except Exception:
                    try:
                        x = wte([input_ids, None, None, None], mode="embedding")
                    except Exception:
                        if hasattr(self.model_tester, "embedding_size"):
                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
                        else:
@@ -357,9 +375,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)

    return output
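# For example, ids_tensor([13, 7], vocab_size=99) yields an int32 tensor of
# shape (13, 7) with values drawn uniformly from [0, 99).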
...
@@ -12,23 +12,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import CTRLConfig, is_tf_available

from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow


if is_tf_available():
    from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP


@require_tf
@@ -37,32 +33,32 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()

    class TFCTRLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -127,13 +123,21 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
@@ -145,30 +149,36 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLLMHeadModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
@@ -192,6 +202,6 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
            model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
@@ -12,62 +12,70 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function

import unittest

from transformers import DistilBertConfig, is_tf_available

from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import require_tf


if is_tf_available():
    from transformers.modeling_tf_distilbert import (
        TFDistilBertModel,
        TFDistilBertForMaskedLM,
        TFDistilBertForQuestionAnswering,
        TFDistilBertForSequenceClassification,
    )


@require_tf
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (
        (
            TFDistilBertModel,
            TFDistilBertForMaskedLM,
            TFDistilBertForQuestionAnswering,
            TFDistilBertForSequenceClassification,
        )
        if is_tf_available()
        else None
    )
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
    test_head_masking = True

    class TFDistilBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=False,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
@@ -116,14 +124,16 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_distilbert_model(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFDistilBertModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}

            outputs = model(inputs)
            sequence_output = outputs[0]
...@@ -136,54 +146,51 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -136,54 +146,51 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
"sequence_output": sequence_output.numpy(), "sequence_output": sequence_output.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
[self.batch_size, self.seq_length, self.hidden_size]) )
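The dict-based call above relies on TF 2.0 models accepting their inputs as a dict keyed by argument name. A self-contained sketch of the same shape check with a deliberately tiny config; the sizes mirror the tester defaults, and the DistilBertConfig keyword names are an assumption, not taken from this diff:

import tensorflow as tf
from transformers import DistilBertConfig, is_tf_available

if is_tf_available():
    from transformers.modeling_tf_distilbert import TFDistilBertModel

    # Tiny config so the sketch runs quickly (assumed keyword names).
    config = DistilBertConfig(vocab_size=99, dim=32, n_layers=5, n_heads=4, hidden_dim=37)
    model = TFDistilBertModel(config)
    input_ids = tf.random.uniform([13, 7], maxval=99, dtype=tf.int32)
    attention_mask = tf.ones_like(input_ids)
    sequence_output = model({"input_ids": input_ids, "attention_mask": attention_mask})[0]
    assert sequence_output.shape == (13, 7, 32)  # (batch_size, seq_length, hidden_size)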
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): def create_and_check_distilbert_for_masked_lm(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFDistilBertForMaskedLM(config=config) model = TFDistilBertForMaskedLM(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask}
'attention_mask': input_mask}
(prediction_scores,) = model(inputs) (prediction_scores,) = model(inputs)
result = { result = {
"prediction_scores": prediction_scores.numpy(), "prediction_scores": prediction_scores.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
[self.batch_size, self.seq_length, self.vocab_size]) )
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): def create_and_check_distilbert_for_question_answering(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFDistilBertForQuestionAnswering(config=config) model = TFDistilBertForQuestionAnswering(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask}
'attention_mask': input_mask}
start_logits, end_logits = model(inputs) start_logits, end_logits = model(inputs)
result = { result = {
"start_logits": start_logits.numpy(), "start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(), "end_logits": end_logits.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
list(result["start_logits"].shape), self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].shape),
[self.batch_size, self.seq_length])
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): def create_and_check_distilbert_for_sequence_classification(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFDistilBertForSequenceClassification(config) model = TFDistilBertForSequenceClassification(config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask}
'attention_mask': input_mask}
(logits,) = model(inputs) (logits,) = model(inputs)
result = { result = {
"logits": logits.numpy(), "logits": logits.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
list(result["logits"].shape),
[self.batch_size, self.num_labels])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict return config, inputs_dict
def setUp(self): def setUp(self):
...@@ -215,5 +222,6 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -215,5 +222,6 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
# model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
# self.assertIsNotNone(model) # self.assertIsNotNone(model)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
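Stepping back, every file in this diff follows the same two-layer pattern: a plain tester object builds a small random config plus inputs, and the unittest class drives one create_and_check_* method per model head. A condensed, hypothetical sketch of that control flow:

import unittest

class _SketchTester(object):
    # Hypothetical, condensed version of the tester objects above.
    def __init__(self, parent):
        self.parent = parent
        self.vocab_size = 99

    def prepare_config_and_inputs(self):
        config = {"vocab_size": self.vocab_size}  # stands in for a real *Config
        input_ids = [[1, 2, 3]]                   # stands in for ids_tensor(...)
        return config, input_ids

    def create_and_check_model(self, config, input_ids):
        # A real tester builds the model here and asserts output shapes.
        self.parent.assertEqual(config["vocab_size"], self.vocab_size)

class SketchModelTest(unittest.TestCase):
    def test_model(self):
        tester = _SketchTester(self)
        tester.create_and_check_model(*tester.prepare_config_and_inputs())

if __name__ == "__main__":
    unittest.main()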
...@@ -12,60 +12,60 @@ ...@@ -12,60 +12,60 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import absolute_import from __future__ import absolute_import, division, print_function
from __future__ import division
from __future__ import print_function
import unittest import unittest
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from transformers import GPT2Config, is_tf_available
from .configuration_common_test import ConfigTester from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow from .utils import CACHE_DIR, require_tf, slow
from transformers import GPT2Config, is_tf_available
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel, from transformers.modeling_tf_gpt2 import (
TFGPT2DoubleHeadsModel, TFGPT2Model,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) TFGPT2LMHeadModel,
TFGPT2DoubleHeadsModel,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
)
@require_tf @require_tf
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
TFGPT2DoubleHeadsModel) if is_tf_available() else ()
# all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else () # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
class TFGPT2ModelTester(object): class TFGPT2ModelTester(object):
def __init__(
def __init__(self, self,
parent, parent,
batch_size=13, batch_size=13,
seq_length=7, seq_length=7,
is_training=True, is_training=True,
use_token_type_ids=True, use_token_type_ids=True,
use_input_mask=True, use_input_mask=True,
use_labels=True, use_labels=True,
use_mc_token_ids=True, use_mc_token_ids=True,
vocab_size=99, vocab_size=99,
hidden_size=32, hidden_size=32,
num_hidden_layers=5, num_hidden_layers=5,
num_attention_heads=4, num_attention_heads=4,
intermediate_size=37, intermediate_size=37,
hidden_act="gelu", hidden_act="gelu",
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
max_position_embeddings=512, max_position_embeddings=512,
type_vocab_size=16, type_vocab_size=16,
type_sequence_label_size=2, type_sequence_label_size=2,
initializer_range=0.02, initializer_range=0.02,
num_labels=3, num_labels=3,
num_choices=4, num_choices=4,
scope=None, scope=None,
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
self.seq_length = seq_length self.seq_length = seq_length
...@@ -130,13 +130,21 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -130,13 +130,21 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels return (
config,
input_ids,
input_mask,
head_mask,
token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
)
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2Model(config=config) model = TFGPT2Model(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0] sequence_output = model(inputs)[0]
inputs = [input_ids, None, input_mask] # None is the input for 'past' inputs = [input_ids, None, input_mask] # None is the input for 'past'
...@@ -148,54 +156,58 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -148,54 +156,58 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
"sequence_output": sequence_output.numpy(), "sequence_output": sequence_output.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
[self.batch_size, self.seq_length, self.hidden_size]) )
def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFGPT2LMHeadModel(config=config) model = TFGPT2LMHeadModel(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0] prediction_scores = model(inputs)[0]
result = { result = {
"prediction_scores": prediction_scores.numpy(), "prediction_scores": prediction_scores.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
[self.batch_size, self.seq_length, self.vocab_size]) )
def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): def create_and_check_gpt2_double_head(
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
):
model = TFGPT2DoubleHeadsModel(config=config) model = TFGPT2DoubleHeadsModel(config=config)
multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
inputs = {'input_ids': multiple_choice_inputs_ids, inputs = {
'mc_token_ids': mc_token_ids, "input_ids": multiple_choice_inputs_ids,
'attention_mask': multiple_choice_input_mask, "mc_token_ids": mc_token_ids,
'token_type_ids': multiple_choice_token_type_ids} "attention_mask": multiple_choice_input_mask,
lm_logits, mc_logits = model(inputs)[:2] "token_type_ids": multiple_choice_token_type_ids,
result = {
"lm_logits": lm_logits.numpy(),
"mc_logits": mc_logits.numpy()
} }
lm_logits, mc_logits = model(inputs)[:2]
result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["lm_logits"].shape), list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
[self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) )
self.parent.assertListEqual( self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
list(result["mc_logits"].shape),
[self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids, (
mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs config,
input_ids,
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} input_mask,
head_mask,
token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict return config, inputs_dict
def setUp(self): def setUp(self):
...@@ -223,6 +235,6 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -223,6 +235,6 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR) model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
self.assertIsNotNone(model) self.assertIsNotNone(model)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
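The double-heads check above turns per-example tensors into per-choice tensors with expand_dims plus tile; the same reshaping in isolation, with sizes taken from the tester defaults:

import tensorflow as tf

batch_size, num_choices, seq_length = 13, 4, 7
input_ids = tf.random.uniform([batch_size, seq_length], maxval=99, dtype=tf.int32)

# (batch, seq) -> (batch, 1, seq) -> (batch, num_choices, seq):
# insert a choices axis, then repeat the sequence once per choice.
multiple_choice_input_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, num_choices, 1))
assert multiple_choice_input_ids.shape == (batch_size, num_choices, seq_length)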
...@@ -12,59 +12,61 @@ ...@@ -12,59 +12,61 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import absolute_import from __future__ import absolute_import, division, print_function
from __future__ import division
from __future__ import print_function
import unittest import unittest
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from transformers import OpenAIGPTConfig, is_tf_available
from .configuration_common_test import ConfigTester from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow from .utils import CACHE_DIR, require_tf, slow
from transformers import OpenAIGPTConfig, is_tf_available
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, from transformers.modeling_tf_openai import (
TFOpenAIGPTDoubleHeadsModel, TFOpenAIGPTModel,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) TFOpenAIGPTLMHeadModel,
TFOpenAIGPTDoubleHeadsModel,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
)
@require_tf @require_tf
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, all_model_classes = (
TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
)
class TFOpenAIGPTModelTester(object): class TFOpenAIGPTModelTester(object):
def __init__(
def __init__(self, self,
parent, parent,
batch_size=13, batch_size=13,
seq_length=7, seq_length=7,
is_training=True, is_training=True,
use_token_type_ids=True, use_token_type_ids=True,
use_input_mask=True, use_input_mask=True,
use_labels=True, use_labels=True,
use_mc_token_ids=True, use_mc_token_ids=True,
vocab_size=99, vocab_size=99,
hidden_size=32, hidden_size=32,
num_hidden_layers=5, num_hidden_layers=5,
num_attention_heads=4, num_attention_heads=4,
intermediate_size=37, intermediate_size=37,
hidden_act="gelu", hidden_act="gelu",
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
max_position_embeddings=512, max_position_embeddings=512,
type_vocab_size=16, type_vocab_size=16,
type_sequence_label_size=2, type_sequence_label_size=2,
initializer_range=0.02, initializer_range=0.02,
num_labels=3, num_labels=3,
num_choices=4, num_choices=4,
scope=None, scope=None,
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
self.seq_length = seq_length self.seq_length = seq_length
...@@ -129,13 +131,21 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -129,13 +131,21 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels return (
config,
input_ids,
input_mask,
head_mask,
token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
)
def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTModel(config=config) model = TFOpenAIGPTModel(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0] sequence_output = model(inputs)[0]
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
...@@ -147,54 +157,58 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -147,54 +157,58 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
"sequence_output": sequence_output.numpy(), "sequence_output": sequence_output.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
[self.batch_size, self.seq_length, self.hidden_size]) )
def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFOpenAIGPTLMHeadModel(config=config) model = TFOpenAIGPTLMHeadModel(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores = model(inputs)[0] prediction_scores = model(inputs)[0]
result = { result = {
"prediction_scores": prediction_scores.numpy(), "prediction_scores": prediction_scores.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
[self.batch_size, self.seq_length, self.vocab_size]) )
def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): def create_and_check_openai_gpt_double_head(
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
):
model = TFOpenAIGPTDoubleHeadsModel(config=config) model = TFOpenAIGPTDoubleHeadsModel(config=config)
multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
inputs = {'input_ids': multiple_choice_inputs_ids, inputs = {
'mc_token_ids': mc_token_ids, "input_ids": multiple_choice_inputs_ids,
'attention_mask': multiple_choice_input_mask, "mc_token_ids": mc_token_ids,
'token_type_ids': multiple_choice_token_type_ids} "attention_mask": multiple_choice_input_mask,
lm_logits, mc_logits = model(inputs)[:2] "token_type_ids": multiple_choice_token_type_ids,
result = {
"lm_logits": lm_logits.numpy(),
"mc_logits": mc_logits.numpy()
} }
lm_logits, mc_logits = model(inputs)[:2]
result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["lm_logits"].shape), list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
[self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) )
self.parent.assertListEqual( self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
list(result["mc_logits"].shape),
[self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, head_mask, token_type_ids, (
mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs config,
input_ids,
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} input_mask,
head_mask,
token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict return config, inputs_dict
def setUp(self): def setUp(self):
...@@ -222,6 +236,6 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -222,6 +236,6 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR) model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
self.assertIsNotNone(model) self.assertIsNotNone(model)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
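One detail shared by all four files is worth spelling out: TF-only imports sit behind is_tf_available(), and all_model_classes degrades to an empty tuple, so the module still imports cleanly on a torch-only install while @require_tf skips the tests. A minimal sketch of the same guard, using tf.keras.Model as a stand-in model class:

from transformers import is_tf_available

if is_tf_available():
    import tensorflow as tf  # imported only when the backend is present

# The false branch of a conditional expression is never evaluated,
# so `tf` need not be defined when TensorFlow is missing.
ALL_MODEL_CLASSES = (tf.keras.Model,) if is_tf_available() else ()

for model_class in ALL_MODEL_CLASSES:
    print(model_class.__name__)  # without TF: an empty loop, not an ImportError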
...@@ -12,59 +12,62 @@ ...@@ -12,59 +12,62 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import absolute_import from __future__ import absolute_import, division, print_function
from __future__ import division
from __future__ import print_function
import unittest import unittest
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from transformers import RobertaConfig, is_tf_available
from .configuration_common_test import ConfigTester from .configuration_common_test import ConfigTester
from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
from .utils import CACHE_DIR, require_tf, slow from .utils import CACHE_DIR, require_tf, slow
from transformers import RobertaConfig, is_tf_available
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
import numpy import numpy
from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM, from transformers.modeling_tf_roberta import (
TFRobertaForSequenceClassification, TFRobertaModel,
TFRobertaForTokenClassification, TFRobertaForMaskedLM,
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
)
@require_tf @require_tf
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM, all_model_classes = (
TFRobertaForSequenceClassification) if is_tf_available() else () (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else ()
)
class TFRobertaModelTester(object): class TFRobertaModelTester(object):
def __init__(
def __init__(self, self,
parent, parent,
batch_size=13, batch_size=13,
seq_length=7, seq_length=7,
is_training=True, is_training=True,
use_input_mask=True, use_input_mask=True,
use_token_type_ids=True, use_token_type_ids=True,
use_labels=True, use_labels=True,
vocab_size=99, vocab_size=99,
hidden_size=32, hidden_size=32,
num_hidden_layers=5, num_hidden_layers=5,
num_attention_heads=4, num_attention_heads=4,
intermediate_size=37, intermediate_size=37,
hidden_act="gelu", hidden_act="gelu",
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
max_position_embeddings=512, max_position_embeddings=512,
type_vocab_size=16, type_vocab_size=16,
type_sequence_label_size=2, type_sequence_label_size=2,
initializer_range=0.02, initializer_range=0.02,
num_labels=3, num_labels=3,
num_choices=4, num_choices=4,
scope=None, scope=None,
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
self.seq_length = seq_length self.seq_length = seq_length
...@@ -118,16 +121,16 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -118,16 +121,16 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
attention_probs_dropout_prob=self.attention_probs_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range) initializer_range=self.initializer_range,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, def create_and_check_roberta_model(
token_labels, choice_labels): self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFRobertaModel(config=config) model = TFRobertaModel(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output = model(inputs)[0] sequence_output = model(inputs)[0]
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
...@@ -139,39 +142,47 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -139,39 +142,47 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
"sequence_output": sequence_output.numpy(), "sequence_output": sequence_output.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
[self.batch_size, self.seq_length, self.hidden_size]) )
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, def create_and_check_roberta_for_masked_lm(
token_labels, choice_labels): self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFRobertaForMaskedLM(config=config) model = TFRobertaForMaskedLM(config=config)
prediction_scores = model([input_ids, input_mask, token_type_ids])[0] prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
result = { result = {
"prediction_scores": prediction_scores.numpy(), "prediction_scores": prediction_scores.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
[self.batch_size, self.seq_length, self.vocab_size]) )
def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): def create_and_check_roberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFRobertaForTokenClassification(config=config) model = TFRobertaForTokenClassification(config=config)
inputs = {'input_ids': input_ids, inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
'attention_mask': input_mask, (logits,) = model(inputs)
'token_type_ids': token_type_ids}
logits, = model(inputs)
result = { result = {
"logits": logits.numpy(), "logits": logits.numpy(),
} }
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["logits"].shape), list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
[self.batch_size, self.seq_length, self.num_labels]) )
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask, (
sequence_labels, token_labels, choice_labels) = config_and_inputs config,
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict return config, inputs_dict
def setUp(self): def setUp(self):
...@@ -196,61 +207,43 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): ...@@ -196,61 +207,43 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
self.assertIsNotNone(model) self.assertIsNotNone(model)
class TFRobertaModelIntegrationTest(unittest.TestCase): class TFRobertaModelIntegrationTest(unittest.TestCase):
@slow @slow
def test_inference_masked_lm(self): def test_inference_masked_lm(self):
model = TFRobertaForMaskedLM.from_pretrained('roberta-base') model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0] output = model(input_ids)[0]
expected_shape = [1, 11, 50265] expected_shape = [1, 11, 50265]
self.assertEqual( self.assertEqual(list(output.numpy().shape), expected_shape)
list(output.numpy().shape),
expected_shape
)
# compare the actual values for a slice. # compare the actual values for a slice.
expected_slice = tf.constant( expected_slice = tf.constant(
[[[33.8843, -4.3107, 22.7779], [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
[ 4.6533, -2.8099, 13.6252],
[ 1.8222, -3.6898, 8.8600]]]
)
self.assertTrue(
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
) )
self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
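The integration tests above pin down only a 3x3 corner of the output and compare it with an absolute tolerance rather than exact equality; the same assertion style in isolation:

import numpy

# Pretend `output` is a (1, seq, hidden) activation slice from a model.
output = numpy.array([[[33.8843, -4.3107, 22.7779],
                       [4.6533, -2.8099, 13.6252],
                       [1.8222, -3.6898, 8.8600]]])
expected_slice = output + 1e-4  # off by less than the 1e-3 tolerance
assert numpy.allclose(output[:, :3, :3], expected_slice, atol=1e-3)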
@slow @slow
def test_inference_no_head(self): def test_inference_no_head(self):
model = TFRobertaModel.from_pretrained('roberta-base') model = TFRobertaModel.from_pretrained("roberta-base")
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0] output = model(input_ids)[0]
# compare the actual values for a slice. # compare the actual values for a slice.
expected_slice = tf.constant( expected_slice = tf.constant(
[[[-0.0231, 0.0782, 0.0074], [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
[-0.1854, 0.0539, -0.0174],
[ 0.0548, 0.0799, 0.1687]]]
)
self.assertTrue(
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
) )
self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
@slow @slow
def test_inference_classification_head(self): def test_inference_classification_head(self):
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli') model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0] output = model(input_ids)[0]
expected_shape = [1, 3] expected_shape = [1, 3]
self.assertEqual( self.assertEqual(list(output.numpy().shape), expected_shape)
list(output.numpy().shape), expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
expected_shape self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3))
)
expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
self.assertTrue(
numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3)
)
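All three integration tests carry the slow marker imported from .utils, which, as far as this diff shows, gates expensive pretrained-weight downloads behind an opt-in. A sketch of how such a decorator is typically built; the RUN_SLOW variable name is an assumption, not taken from this diff:

import os
import unittest

def slow(test_case):
    # Skip unless the environment explicitly opts in to slow tests.
    run_slow = os.environ.get("RUN_SLOW", "0").lower() in ("1", "true", "yes")
    return unittest.skipUnless(run_slow, "test is slow")(test_case)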
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()